Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.charset.Charset;
031import java.nio.charset.StandardCharsets;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.text.MessageFormat;
035import java.util.Locale;
036import java.util.Objects;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.Charsets;
042import org.apache.commons.io.IOUtils;
043import org.apache.commons.io.build.AbstractStreamBuilder;
044import org.apache.commons.io.function.IOConsumer;
045
046/**
047 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
048 * <p>
049 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
050 * </p>
051 * <p>
052 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100%
053 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
054 * </p>
055 * <p>
056 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
057 * </p>
058 * <p>
059 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
060 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
061 * Determining the character encoding of a feed</a>.
062 * </p>
063 * <p>
064 * To build an instance, use {@link Builder}.
065 * </p>
066 * <p>
067 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
068 * </p>
069 *
070 * @see Builder
071 * @see org.apache.commons.io.output.XmlStreamWriter
072 * @since 2.0
073 */
074public class XmlStreamReader extends Reader {
075
076    // @formatter:off
077    /**
078     * Builds a new {@link XmlStreamReader}.
079     *
080     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
081     * <p>
082     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
083     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
084     * </p>
085     * <p>
086     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
087     * </p>
088     * <p>
089     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
090     * </p>
091     * <p>
092     * Else if the XML prolog had a charset encoding that encoding is used.
093     * </p>
094     * <p>
095     * Else if the content type had a charset encoding that encoding is used.
096     * </p>
097     * <p>
098     * Else 'UTF-8' is used.
099     * </p>
100     * <p>
101     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
102     * </p>
103     * <p>
104     * For example:
105     * </p>
106     *
107     * <pre>{@code
108     * XmlStreamReader r = XmlStreamReader.builder()
109     *   .setPath(path)
110     *   .setCharset(StandardCharsets.UTF_8)
111     *   .get();
112     * }
113     * </pre>
114     *
115     * @see #get()
116     * @since 2.12.0
117     */
118    // @formatter:on
119    public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
120
121        private boolean nullCharset = true;
122        private boolean lenient = true;
123        private String httpContentType;
124
125        /**
126         * Constructs a new builder of {@link XmlStreamReader}.
127         */
128        public Builder() {
129            // empty
130        }
131
132        /**
133         * Builds a new {@link XmlStreamReader}.
134         * <p>
135         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
136         * </p>
137         * <p>
138         * This builder uses the following aspects:
139         * </p>
140         * <ul>
141         * <li>{@link #getInputStream()}</li>
142         * <li>{@link #getCharset()}</li>
143         * <li>lenient</li>
144         * <li>httpContentType</li>
145         * </ul>
146         *
147         * @return a new instance.
148         * @throws IllegalStateException         if the {@code origin} is {@code null}.
149         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
150         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
151         * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
152         * @see #getInputStream()
153         * @see #getUnchecked()
154         */
155        @Override
156        public XmlStreamReader get() throws IOException {
157            final String defaultEncoding = nullCharset ? null : getCharset().name();
158            // @formatter:off
159            return httpContentType == null
160                    ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
161                    : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
162            // @formatter:on
163        }
164
165        @Override
166        public Builder setCharset(final Charset charset) {
167            nullCharset = charset == null;
168            return super.setCharset(charset);
169        }
170
171        @Override
172        public Builder setCharset(final String charset) {
173            nullCharset = charset == null;
174            return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
175        }
176
177        /**
178         * Sets the HTTP content type.
179         *
180         * @param httpContentType the HTTP content type.
181         * @return {@code this} instance.
182         */
183        public Builder setHttpContentType(final String httpContentType) {
184            this.httpContentType = httpContentType;
185            return this;
186        }
187
188        /**
189         * Sets the lenient toggle.
190         *
191         * @param lenient the lenient toggle.
192         * @return {@code this} instance.
193         */
194        public Builder setLenient(final boolean lenient) {
195            this.lenient = lenient;
196            return this;
197        }
198
199    }
200
201    private static final String UTF_8 = StandardCharsets.UTF_8.name();
202
203    private static final String US_ASCII = StandardCharsets.US_ASCII.name();
204
205    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
206
207    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
208
209    private static final String UTF_32BE = "UTF-32BE"; // string literal: StandardCharsets declares no UTF-32 constants
210
211    private static final String UTF_32LE = "UTF-32LE";
212
213    private static final String UTF_16 = StandardCharsets.UTF_16.name();
214
215    private static final String UTF_32 = "UTF-32";
216
217    private static final String EBCDIC = "CP1047"; // charset name attached to the EBCDIC entry of XML_GUESS_BYTES
218
    // Byte order marks the constructors look for on the raw stream.
219    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
220            ByteOrderMark.UTF_32LE };
221
222    /**
     * Not real byte order marks: each entry is the byte form of the first characters of {@code <?xml} ({@code <?xm}, or {@code <?} for UTF-16) in the named
     * charset. The constructors pass this array to a second BOMInputStream layered on the BOM-detecting one.
     * UTF_16LE and UTF_32LE have the same two starting BOM bytes.
     */
223    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
224            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
225            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
226            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
227            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
228
    // Finds the charset parameter in the parameter section of a content type; group 1 is the value without any surrounding quote.
229    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
230
231    /**
232     * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
233     * <p>
234     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
235     * </p>
236     * <p>
237     * Note the documented pattern is:
238     * </p>
239     * <pre>
240     * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
241     * </pre>
242     * <p>
243     * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
244     * {@code 'ebcdic-de-273+euro'}.
245     * </p>
246     */
247    public static final Pattern ENCODING_PATTERN = Pattern.compile(
248    // @formatter:off
249            "^<\\?xml\\s+"
250            + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
251            + "encoding\\s*=\\s*"
252            + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
253            +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
254            Pattern.MULTILINE);
255    // @formatter:on
256
    // Message templates with positional {n} placeholders. HTTP_EX_3 is filled in with MessageFormat.format in calculateHttpEncoding;
    // the other templates presumably follow the same convention -- confirm against their call sites.
257    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
258
259    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
260
261    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
262
263    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
264
265    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
264
265    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
266
267    /**
268     * Constructs a new {@link Builder}.
269     *
270     * @return a new {@link Builder}.
271     * @since 2.12.0
272     */
273    public static Builder builder() {
274        return new Builder(); // each call returns a separate builder; its lenient toggle starts out true and its content type null
275    }
276
277    /**
278     * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
279     *
280     * @param httpContentType the HTTP content type
281     * @return The content type encoding (upcased)
282     */
283    static String getContentTypeEncoding(final String httpContentType) {
284        String encoding = null;
285        if (httpContentType != null) {
286            final int i = httpContentType.indexOf(";");
287            if (i > -1) {
288                final String postMime = httpContentType.substring(i + 1);
289                final Matcher m = CHARSET_PATTERN.matcher(postMime);
290                encoding = m.find() ? m.group(1) : null;
291                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
292            }
293        }
294        return encoding;
295    }
296
297    /**
298     * Gets the MIME type or {@code null} if httpContentType is {@code null}.
299     *
300     * @param httpContentType the HTTP content type
301     * @return The mime content type
302     */
303    static String getContentTypeMime(final String httpContentType) {
304        String mime = null;
305        if (httpContentType != null) {
306            final int i = httpContentType.indexOf(";");
307            mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
308            mime = mime.trim();
309        }
310        return mime;
311    }
312
313    /**
314     * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
315     *
316     * @param inputStream InputStream to create the reader from.
317     * @param guessedEnc  guessed encoding
318     * @return the encoding declared in the <?xml encoding=...?>
319     * @throws IOException thrown if there is a problem reading the stream.
320     */
321    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
322        String encoding = null;
323        if (guessedEnc != null) {
324            final byte[] bytes = IOUtils.byteArray();
325            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
326            int offset = 0;
327            int max = IOUtils.DEFAULT_BUFFER_SIZE;
328            int c = inputStream.read(bytes, offset, max);
329            int firstGT = -1;
330            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
331            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
332                offset += c;
333                max -= c;
334                c = inputStream.read(bytes, offset, max);
335                xmlProlog = new String(bytes, 0, offset, guessedEnc);
336                firstGT = xmlProlog.indexOf('>');
337            }
338            if (firstGT == -1) {
339                if (c == -1) {
340                    throw new IOException("Unexpected end of XML stream");
341                }
342                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
343            }
344            final int bytesRead = offset;
345            if (bytesRead > 0) {
346                inputStream.reset();
347                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
348                final StringBuilder prolog = new StringBuilder();
349                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
350                final Matcher m = ENCODING_PATTERN.matcher(prolog);
351                if (m.find()) {
352                    encoding = m.group(1).toUpperCase(Locale.ROOT);
353                    encoding = encoding.substring(1, encoding.length() - 1);
354                }
355            }
356        }
357        return encoding;
358    }
359
360    /**
361     * Tests if the MIME type belongs to the APPLICATION XML family.
362     *
363     * @param mime The mime type
364     * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
365     */
366    static boolean isAppXml(final String mime) {
367        return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
368                || mime.startsWith("application/") && mime.endsWith("+xml"));
369    }
370
371    /**
372     * Tests if the MIME type belongs to the TEXT XML family.
373     *
374     * @param mime The mime type
375     * @return true if the mime type belongs to the TEXT XML family, otherwise false
376     */
377    static boolean isTextXml(final String mime) {
378        return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
379    }
380
381    private final Reader reader; // set by every non-delegating constructor to an InputStreamReader over the wrapped stream
382
383    private final String encoding; // charset name resolved during construction and used to create the reader
384
385    private final String defaultEncoding; // caller-supplied default encoding; null when a delegating constructor supplies none
386
387    /**
388     * Constructs a Reader for a File.
389     * <p>
390     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
391     * </p>
392     * <p>
393     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
394     * </p>
395     *
396     * @param file File to create a Reader from.
397     * @throws NullPointerException if the input is {@code null}.
398     * @throws IOException          thrown if there is a problem reading the file.
399     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
400     */
401    @Deprecated
402    public XmlStreamReader(final File file) throws IOException {
403        this(Objects.requireNonNull(file, "file").toPath()); // null-check first, then hand over to the Path constructor
404    }
405
406    /**
407     * Constructs a Reader for a raw InputStream.
408     * <p>
409     * It follows the same logic used for files.
410     * </p>
411     * <p>
412     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
413     * </p>
414     *
415     * @param inputStream InputStream to create a Reader from.
416     * @throws NullPointerException if the input stream is {@code null}.
417     * @throws IOException          thrown if there is a problem reading the stream.
418     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
419     */
420    @Deprecated
421    public XmlStreamReader(final InputStream inputStream) throws IOException {
422        this(inputStream, true); // true = lenient detection
423    }
424
425    /**
426     * Constructs a Reader for a raw InputStream.
427     * <p>
428     * It follows the same logic used for files.
429     * </p>
430     * <p>
431     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
432     * </p>
433     * <p>
434     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
435     * </p>
436     * <p>
437     * Else if the XML prolog had a charset encoding that encoding is used.
438     * </p>
439     * <p>
440     * Else if the content type had a charset encoding that encoding is used.
441     * </p>
442     * <p>
443     * Else 'UTF-8' is used.
444     * </p>
445     * <p>
446     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
447     * </p>
448     *
449     * @param inputStream InputStream to create a Reader from.
450     * @param lenient     indicates if the charset encoding detection should be relaxed.
451     * @throws NullPointerException     if the input stream is {@code null}.
452     * @throws IOException              thrown if there is a problem reading the stream.
453     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
454     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
455     */
456    @Deprecated
457    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
458        this(inputStream, lenient, null); // null = no caller-supplied default encoding
459    }
460
461    /**
462     * Constructs a Reader for a raw InputStream.
463     * <p>
464     * It follows the same logic used for files.
465     * </p>
466     * <p>
467     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
468     * </p>
469     * <p>
470     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
471     * </p>
472     * <p>
473     * Else if the XML prolog had a charset encoding that encoding is used.
474     * </p>
475     * <p>
476     * Else if the content type had a charset encoding that encoding is used.
477     * </p>
478     * <p>
479     * Else 'UTF-8' is used.
480     * </p>
481     * <p>
482     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
483     * </p>
484     *
485     * @param inputStream     InputStream to create a Reader from.
486     * @param lenient         indicates if the charset encoding detection should be relaxed.
487     * @param defaultEncoding The default encoding
488     * @throws NullPointerException     if the input stream is {@code null}.
489     * @throws IOException              thrown if there is a problem reading the stream.
490     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
491     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
492     */
493    @Deprecated
494    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
495    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
496        this.defaultEncoding = defaultEncoding;
497        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
498                false, BOMS);
499        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
500        this.encoding = processHttpStream(bom, pis, lenient);
501        this.reader = new InputStreamReader(pis, encoding);
502    }
503
504    /**
505     * Constructs a Reader using an InputStream and the associated content-type header.
506     * <p>
507     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
508     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
509     * </p>
510     * <p>
511     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
512     * </p>
513     *
514     * @param inputStream     InputStream to create the reader from.
515     * @param httpContentType content-type header to use for the resolution of the charset encoding.
516     * @throws NullPointerException if the input stream is {@code null}.
517     * @throws IOException          thrown if there is a problem reading the file.
518     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
519     */
520    @Deprecated
521    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
522        this(inputStream, httpContentType, true); // true = lenient detection
523    }
524
525    /**
526     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
527     * <p>
528     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
529     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
530     * </p>
531     * <p>
532     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
533     * </p>
534     * <p>
535     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
536     * </p>
537     * <p>
538     * Else if the XML prolog had a charset encoding that encoding is used.
539     * </p>
540     * <p>
541     * Else if the content type had a charset encoding that encoding is used.
542     * </p>
543     * <p>
544     * Else 'UTF-8' is used.
545     * </p>
546     * <p>
547     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
548     * </p>
549     *
550     * @param inputStream     InputStream to create the reader from.
551     * @param httpContentType content-type header to use for the resolution of the charset encoding.
552     * @param lenient         indicates if the charset encoding detection should be relaxed.
553     * @throws NullPointerException     if the input stream is {@code null}.
554     * @throws IOException              thrown if there is a problem reading the file.
555     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
556     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
557     */
558    @Deprecated
559    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
560        this(inputStream, httpContentType, lenient, null); // null = no caller-supplied default encoding
561    }
562
563    /**
564     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
565     * <p>
566     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
567     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
568     * </p>
569     * <p>
570     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
571     * </p>
572     * <p>
573     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
574     * </p>
575     * <p>
576     * Else if the XML prolog had a charset encoding that encoding is used.
577     * </p>
578     * <p>
579     * Else if the content type had a charset encoding that encoding is used.
580     * </p>
581     * <p>
582     * Else 'UTF-8' is used.
583     * </p>
584     * <p>
585     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
586     * </p>
587     *
588     * @param inputStream     InputStream to create the reader from.
589     * @param httpContentType content-type header to use for the resolution of the charset encoding.
590     * @param lenient         indicates if the charset encoding detection should be relaxed.
591     * @param defaultEncoding The default encoding
592     * @throws NullPointerException     if the input stream is {@code null}.
593     * @throws IOException              thrown if there is a problem reading the file.
594     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
595     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
596     */
597    @Deprecated
598    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
599    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
600            throws IOException {
601        this.defaultEncoding = defaultEncoding;
602        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
603                false, BOMS);
604        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
605        this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
606        this.reader = new InputStreamReader(pis, encoding);
607    }
608
609    /**
610     * Constructs a Reader for a File.
611     * <p>
612     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
613     * </p>
614     * <p>
615     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
616     * </p>
617     *
618     * @param file File to create a Reader from.
619     * @throws NullPointerException if the input is {@code null}.
620     * @throws IOException          thrown if there is a problem reading the file.
621     * @since 2.11.0
622     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
623     */
624    @Deprecated
625    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
626    public XmlStreamReader(final Path file) throws IOException {
        // NOTE(review): the stream opened here goes straight to the InputStream constructor, which declares IOException; if that
        // constructor throws, nothing visible here closes the stream -- confirm whether this is acceptable.
627        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
628    }
629
630    /**
631     * Constructs a Reader using the InputStream of a URL.
632     * <p>
633     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
634     * </p>
635     * <p>
636     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
637     * </p>
638     * <p>
639     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
640     * </p>
641     *
642     * @param url URL to create a Reader from.
643     * @throws NullPointerException if the input is {@code null}.
644     * @throws IOException          thrown if there is a problem reading the stream of the URL.
645     */
646    public XmlStreamReader(final URL url) throws IOException {
647        this(Objects.requireNonNull(url, "url").openConnection(), null); // null = no caller-supplied default encoding
648    }
649
650    /**
651     * Constructs a Reader using the InputStream of a URLConnection.
652     * <p>
653     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
654     * </p>
655     * <p>
656     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
657     * content-type.
658     * </p>
659     * <p>
660     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
661     * </p>
662     *
663     * @param urlConnection   URLConnection to create a Reader from.
664     * @param defaultEncoding The default encoding
665     * @throws NullPointerException if the input is {@code null}.
666     * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
667     */
668    public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
669        Objects.requireNonNull(urlConnection, "urlConnection");
670        this.defaultEncoding = defaultEncoding;
671        final boolean lenient = true;
672        final String contentType = urlConnection.getContentType();
673        final InputStream inputStream = urlConnection.getInputStream();
674        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
675        // @formatter:off
676        final BOMInputStream bomInput = BOMInputStream.builder()
677            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
678            .setInclude(false)
679            .setByteOrderMarks(BOMS)
680            .get();
681        @SuppressWarnings("resource")
682        final BOMInputStream piInput = BOMInputStream.builder()
683            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
684            .setInclude(true)
685            .setByteOrderMarks(XML_GUESS_BYTES)
686            .get();
687        // @formatter:on
688        if (urlConnection instanceof HttpURLConnection || contentType != null) {
689            this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
690        } else {
691            this.encoding = processHttpStream(bomInput, piInput, lenient);
692        }
693        this.reader = new InputStreamReader(piInput, encoding);
694    }
695
696    /**
697     * Calculates the HTTP encoding.
698     * @param bomEnc          BOM encoding
699     * @param xmlGuessEnc     XML Guess encoding
700     * @param xmlEnc          XML encoding
701     * @param lenient         indicates if the charset encoding detection should be relaxed.
702     * @param httpContentType The HTTP content type
703     * @return the HTTP encoding
704     * @throws IOException thrown if there is a problem reading the stream.
705     */
706    String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
707            throws IOException {
708
709        // Lenient and has XML encoding
710        if (lenient && xmlEnc != null) {
711            return xmlEnc;
712        }
713
714        // Determine mime/encoding content types from HTTP Content Type
715        final String cTMime = getContentTypeMime(httpContentType);
716        final String cTEnc = getContentTypeEncoding(httpContentType);
717        final boolean appXml = isAppXml(cTMime);
718        final boolean textXml = isTextXml(cTMime);
719
720        // Mime type NOT "application/xml" or "text/xml"
721        if (!appXml && !textXml) {
722            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
723            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
724        }
725
726        // No content type encoding
727        if (cTEnc == null) {
728            if (appXml) {
729                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
730            }
731            return defaultEncoding == null ? US_ASCII : defaultEncoding;
732        }
733
734        // UTF-16BE or UTF-16LE content type encoding
735        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
736            if (bomEnc != null) {
737                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
738                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
739            }
740            return cTEnc;
741        }
742
743        // UTF-16 content type encoding
744        if (cTEnc.equals(UTF_16)) {
745            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
746                return bomEnc;
747            }
748            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
749            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
750        }
751
752        // UTF-32BE or UTF-132E content type encoding
753        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
754            if (bomEnc != null) {
755                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
756                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
757            }
758            return cTEnc;
759        }
760
761        // UTF-32 content type encoding
762        if (cTEnc.equals(UTF_32)) {
763            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
764                return bomEnc;
765            }
766            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
767            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
768        }
769
770        return cTEnc;
771    }
772
773    /**
774     * Calculate the raw encoding.
775     *
776     * @param bomEnc      BOM encoding
777     * @param xmlGuessEnc XML Guess encoding
778     * @param xmlEnc      XML encoding
779     * @return the raw encoding
780     * @throws IOException thrown if there is a problem reading the stream.
781     */
782    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
783
784        // BOM is Null
785        if (bomEnc == null) {
786            if (xmlGuessEnc == null || xmlEnc == null) {
787                return defaultEncoding == null ? UTF_8 : defaultEncoding;
788            }
789            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
790                return xmlGuessEnc;
791            }
792            return xmlEnc;
793        }
794
795        // BOM is UTF-8
796        if (bomEnc.equals(UTF_8)) {
797            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
798                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
799                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
800            }
801            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
802                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
803                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
804            }
805            return bomEnc;
806        }
807
808        // BOM is UTF-16BE or UTF-16LE
809        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
810            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
811                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
812                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
813            }
814            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
815                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
816                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817            }
818            return bomEnc;
819        }
820
821        // BOM is UTF-32BE or UTF-32LE
822        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
823            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
824                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
825                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
826            }
827            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
828                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
829                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
830            }
831            return bomEnc;
832        }
833
834        // BOM is something else
835        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
836        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
837    }
838
839    /**
840     * Closes the XmlStreamReader stream.
841     *
842     * @throws IOException thrown if there was a problem closing the stream.
843     */
844    @Override
845    public void close() throws IOException {
846        reader.close();
847    }
848
849    /**
850     * Does lenient detection.
851     *
852     * @param httpContentType content-type header to use for the resolution of the charset encoding.
853     * @param ex              The thrown exception
854     * @return the encoding
855     * @throws IOException thrown if there is a problem reading the stream.
856     */
857    private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
858        if (httpContentType != null && httpContentType.startsWith("text/html")) {
859            httpContentType = httpContentType.substring("text/html".length());
860            httpContentType = "text/xml" + httpContentType;
861            try {
862                return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
863            } catch (final XmlStreamReaderException ex2) {
864                ex = ex2;
865            }
866        }
867        String encoding = ex.getXmlEncoding();
868        if (encoding == null) {
869            encoding = ex.getContentTypeEncoding();
870        }
871        if (encoding == null) {
872            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
873        }
874        return encoding;
875    }
876
877    /**
878     * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
879     * <p>
880     * If it is {@code null} the content-type based rules are used.
881     * </p>
882     *
883     * @return the default encoding to use.
884     */
885    public String getDefaultEncoding() {
886        return defaultEncoding;
887    }
888
889    /**
890     * Gets the charset encoding of the XmlStreamReader.
891     *
892     * @return charset encoding.
893     */
894    public String getEncoding() {
895        return encoding;
896    }
897
898    /**
899     * Process the raw stream.
900     *
901     * @param bomInput     BOMInputStream to detect byte order marks
902     * @param piInput     BOMInputStream to guess XML encoding
903     * @param lenient indicates if the charset encoding detection should be relaxed.
904     * @return the encoding to be used
905     * @throws IOException thrown if there is a problem reading the stream.
906     */
907    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
908        final String bomEnc = bomInput.getBOMCharsetName();
909        final String xmlGuessEnc = piInput.getBOMCharsetName();
910        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
911        try {
912            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
913        } catch (final XmlStreamReaderException ex) {
914            if (lenient) {
915                return doLenientDetection(null, ex);
916            }
917            throw ex;
918        }
919    }
920
921    /**
922     * Processes an HTTP stream.
923     *
924     * @param bomInput        BOMInputStream to detect byte order marks
925     * @param piInput         BOMInputStream to guess XML encoding
926     * @param lenient         indicates if the charset encoding detection should be relaxed.
927     * @param httpContentType The HTTP content type
928     * @return the encoding to be used
929     * @throws IOException thrown if there is a problem reading the stream.
930     */
931    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
932            throws IOException {
933        final String bomEnc = bomInput.getBOMCharsetName();
934        final String xmlGuessEnc = piInput.getBOMCharsetName();
935        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
936        try {
937            return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
938        } catch (final XmlStreamReaderException ex) {
939            if (lenient) {
940                return doLenientDetection(httpContentType, ex);
941            }
942            throw ex;
943        }
944    }
945
946    /**
947     * Reads the underlying reader's {@code read(char[], int, int)} method.
948     *
949     * @param buf    the buffer to read the characters into
950     * @param offset The start offset
951     * @param len    The number of bytes to read
952     * @return the number of characters read or -1 if the end of stream
953     * @throws IOException if an I/O error occurs.
954     */
955    @Override
956    public int read(final char[] buf, final int offset, final int len) throws IOException {
957        return reader.read(buf, offset, len);
958    }
959
960}