Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.charset.Charset;
031import java.nio.charset.StandardCharsets;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.text.MessageFormat;
035import java.util.Locale;
036import java.util.Objects;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.Charsets;
042import org.apache.commons.io.IOUtils;
043import org.apache.commons.io.build.AbstractStreamBuilder;
044import org.apache.commons.io.function.IOConsumer;
045import org.apache.commons.io.output.XmlStreamWriter;
046
047/**
048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
049 * <p>
050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
051 * </p>
052 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not
 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
055 * </p>
056 * <p>
057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
058 * </p>
059 * <p>
 * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for strict detection (following the HTTP MIME and
 * XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
062 * Determining the character encoding of a feed</a>.
063 * </p>
064 * <p>
065 * To build an instance, use {@link Builder}.
066 * </p>
067 * <p>
068 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
069 * </p>
070 *
071 * @see Builder
072 * @see org.apache.commons.io.output.XmlStreamWriter
073 * @since 2.0
074 */
075public class XmlStreamReader extends Reader {
076
077    // @formatter:off
078    /**
079     * Builds a new {@link XmlStreamWriter}.
080     *
081     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
082     * <p>
083     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
084     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
085     * </p>
086     * <p>
087     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
088     * </p>
089     * <p>
090     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
091     * </p>
092     * <p>
093     * Else if the XML prolog had a charset encoding that encoding is used.
094     * </p>
095     * <p>
096     * Else if the content type had a charset encoding that encoding is used.
097     * </p>
098     * <p>
099     * Else 'UTF-8' is used.
100     * </p>
101     * <p>
102     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
103     * </p>
104     * <p>
105     * For example:
106     * </p>
107     *
108     * <pre>{@code
109     * XmlStreamReader r = XmlStreamReader.builder()
110     *   .setPath(path)
111     *   .setCharset(StandardCharsets.UTF_8)
112     *   .get();
113     * }
114     * </pre>
115     *
116     * @see #get()
117     * @since 2.12.0
118     */
119    // @formatter:on
120    public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
121
122        private boolean nullCharset = true;
123        private boolean lenient = true;
124        private String httpContentType;
125
126        /**
127         * Constructs a new builder of {@link XmlStreamReader}.
128         */
129        public Builder() {
130            // empty
131        }
132
133        /**
134         * Builds a new {@link XmlStreamWriter}.
135         * <p>
136         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
137         * </p>
138         * <p>
139         * This builder uses the following aspects:
140         * </p>
141         * <ul>
142         * <li>{@link #getInputStream()}</li>
143         * <li>{@link #getCharset()}</li>
144         * <li>lenient</li>
145         * <li>httpContentType</li>
146         * </ul>
147         *
148         * @return a new instance.
149         * @throws IllegalStateException         if the {@code origin} is {@code null}.
150         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
151         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
152         * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
153         * @see #getInputStream()
154         * @see #getUnchecked()
155         */
156        @Override
157        public XmlStreamReader get() throws IOException {
158            final String defaultEncoding = nullCharset ? null : getCharset().name();
159            // @formatter:off
160            return httpContentType == null
161                    ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
162                    : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
163            // @formatter:on
164        }
165
166        @Override
167        public Builder setCharset(final Charset charset) {
168            nullCharset = charset == null;
169            return super.setCharset(charset);
170        }
171
172        @Override
173        public Builder setCharset(final String charset) {
174            nullCharset = charset == null;
175            return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
176        }
177
178        /**
179         * Sets the HTTP content type.
180         *
181         * @param httpContentType the HTTP content type.
182         * @return {@code this} instance.
183         */
184        public Builder setHttpContentType(final String httpContentType) {
185            this.httpContentType = httpContentType;
186            return this;
187        }
188
189        /**
190         * Sets the lenient toggle.
191         *
192         * @param lenient the lenient toggle.
193         * @return {@code this} instance.
194         */
195        public Builder setLenient(final boolean lenient) {
196            this.lenient = lenient;
197            return this;
198        }
199
200    }
201
    // Charset names used by the encoding detection in this class.

    private static final String UTF_8 = StandardCharsets.UTF_8.name();

    private static final String US_ASCII = StandardCharsets.US_ASCII.name();

    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

    // UTF-32 has no StandardCharsets constant, so the names are spelled out.
    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    private static final String UTF_16 = StandardCharsets.UTF_16.name();

    private static final String UTF_32 = "UTF-32";

    private static final String EBCDIC = "CP1047";

    /** Byte order marks handed to the first {@code BOMInputStream} the constructors wrap around the input (created with include set to {@code false}). */
    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE };

    /**
     * Byte signatures of the first characters of an XML declaration ({@code <?} or {@code <?xm}) in each guessed encoding, expressed as pseudo
     * {@link ByteOrderMark}s so the second {@code BOMInputStream} can match them.
     * <p>
     * UTF_16LE and UTF_32LE have the same two starting bytes.
     * </p>
     */
    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

    /**
     * Matches a {@code charset=} parameter whose value may be wrapped in single or double quotes; group 1 captures the value up to the next semicolon,
     * space or quote. The match is case-sensitive.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
231
232    /**
233     * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
234     * <p>
235     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
236     * </p>
237     * <p>
238     * Note the documented pattern is:
239     * </p>
240     * <pre>
241     * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
242     * </pre>
243     * <p>
244     * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
245     * {@code 'ebcdic-de-273+euro'}.
246     * </p>
247     */
248    public static final Pattern ENCODING_PATTERN = Pattern.compile(
249    // @formatter:off
250            "^<\\?xml\\s+"
251            + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
252            + "encoding\\s*=\\s*"
253            + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
254            +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
255            Pattern.MULTILINE);
256    // @formatter:on
257
    // MessageFormat templates for encoding-detection failures.
    // RAW_* templates take: {0} BOM encoding, {1} XML guess encoding, {2} XML prolog encoding.

    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    // HTTP_* templates take: {0} content-type MIME, {1} content-type encoding, {2} BOM encoding, {3} XML guess encoding, {4} XML prolog encoding.

    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
267
    /**
     * Constructs a new {@link Builder}.
     * <p>
     * This is the entry point the deprecated constructors point to: configure the returned builder and call {@link Builder#get()}.
     * </p>
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
277
278    /**
279     * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
280     *
281     * @param httpContentType the HTTP content type
282     * @return The content type encoding (upcased)
283     */
284    static String getContentTypeEncoding(final String httpContentType) {
285        String encoding = null;
286        if (httpContentType != null) {
287            final int i = httpContentType.indexOf(";");
288            if (i > -1) {
289                final String postMime = httpContentType.substring(i + 1);
290                final Matcher m = CHARSET_PATTERN.matcher(postMime);
291                encoding = m.find() ? m.group(1) : null;
292                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
293            }
294        }
295        return encoding;
296    }
297
298    /**
299     * Gets the MIME type or {@code null} if httpContentType is {@code null}.
300     *
301     * @param httpContentType the HTTP content type
302     * @return The mime content type
303     */
304    static String getContentTypeMime(final String httpContentType) {
305        String mime = null;
306        if (httpContentType != null) {
307            final int i = httpContentType.indexOf(";");
308            mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
309            mime = mime.trim();
310        }
311        return mime;
312    }
313
    /**
     * Gets the encoding declared in the {@code <?xml encoding=...?>} prolog, {@code null} if none.
     * <p>
     * When {@code guessedEnc} is {@code null} the stream is not touched and {@code null} is returned. Otherwise up to {@link IOUtils#DEFAULT_BUFFER_SIZE}
     * bytes are read, decoded with {@code guessedEnc} and searched for the first {@code '>'}; the stream is then reset to the marked position and the text
     * up to that {@code '>'} is matched against {@link #ENCODING_PATTERN}.
     * </p>
     *
     * @param inputStream InputStream to create the reader from.
     * @param guessedEnc  guessed encoding
     * @return the encoding declared in the {@code <?xml encoding=...?>} prolog, upper-cased and without quotes, or {@code null}
     * @throws IOException thrown if there is a problem reading the stream, if the stream ends before a {@code '>'} is seen, or if no {@code '>'} is
     *                     found within the bytes read.
     */
    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            final byte[] bytes = IOUtils.byteArray();
            // Mark so the bytes consumed while sniffing can be replayed by reset() below.
            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
            int offset = 0;
            int max = IOUtils.DEFAULT_BUFFER_SIZE;
            int c = inputStream.read(bytes, offset, max);
            int firstGT = -1;
            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
            // Accumulate bytes until a '>' is decoded, the stream ends, or the buffer is full.
            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
                offset += c;
                max -= c;
                // Note: the next chunk is requested before the bytes accumulated so far are searched.
                c = inputStream.read(bytes, offset, max);
                xmlProlog = new String(bytes, 0, offset, guessedEnc);
                firstGT = xmlProlog.indexOf('>');
            }
            if (firstGT == -1) {
                // These failure paths throw before reset() is reached.
                if (c == -1) {
                    throw new IOException("Unexpected end of XML stream");
                }
                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
            }
            final int bytesRead = offset;
            if (bytesRead > 0) {
                inputStream.reset();
                // Join the lines of the prolog with spaces so a declaration split across lines can still match the pattern.
                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
                final StringBuilder prolog = new StringBuilder();
                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
                final Matcher m = ENCODING_PATTERN.matcher(prolog);
                if (m.find()) {
                    encoding = m.group(1).toUpperCase(Locale.ROOT);
                    // Group 1 includes the surrounding quotes; strip them.
                    encoding = encoding.substring(1, encoding.length() - 1);
                }
            }
        }
        return encoding;
    }
360
361    /**
362     * Tests if the MIME type belongs to the APPLICATION XML family.
363     *
364     * @param mime The mime type
365     * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
366     */
367    static boolean isAppXml(final String mime) {
368        return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
369                || mime.startsWith("application/") && mime.endsWith("+xml"));
370    }
371
372    /**
373     * Tests if the MIME type belongs to the TEXT XML family.
374     *
375     * @param mime The mime type
376     * @return true if the mime type belongs to the TEXT XML family, otherwise false
377     */
378    static boolean isTextXml(final String mime) {
379        return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
380    }
381
    /** The reader decoding the wrapped stream with the detected {@link #encoding}; created last in each non-delegating constructor. */
    private final Reader reader;

    /** The charset encoding name determined by the constructors' detection logic. */
    private final String encoding;

    /** The default encoding supplied to the constructor, may be {@code null}. */
    private final String defaultEncoding;
387
    /**
     * Constructs a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor converts the file to a {@link Path} and delegates to {@link #XmlStreamReader(Path)}.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final File file) throws IOException {
        this(Objects.requireNonNull(file, "file").toPath());
    }
406
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor delegates to {@link #XmlStreamReader(InputStream, boolean)} with {@code lenient} set to {@code true}.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }
425
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * </p>
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * </p>
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else 'UTF-8' is used.
     * </p>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
     * </p>
     * <p>
     * This constructor delegates to {@link #XmlStreamReader(InputStream, boolean, String)} with a {@code null} default encoding.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient     indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        // NOTE(review): the content-type steps in the Javadoc look copied from the content-type constructors; no content type is passed here - confirm.
        this(inputStream, lenient, null);
    }
461
462    /**
463     * Constructs a Reader for a raw InputStream.
464     * <p>
465     * It follows the same logic used for files.
466     * </p>
467     * <p>
468     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
469     * </p>
470     * <p>
471     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
472     * </p>
473     * <p>
474     * Else if the XML prolog had a charset encoding that encoding is used.
475     * </p>
476     * <p>
477     * Else if the content type had a charset encoding that encoding is used.
478     * </p>
479     * <p>
480     * Else 'UTF-8' is used.
481     * </p>
482     * <p>
483     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
484     * </p>
485     *
486     * @param inputStream     InputStream to create a Reader from.
487     * @param lenient         indicates if the charset encoding detection should be relaxed.
488     * @param defaultEncoding The default encoding
489     * @throws NullPointerException     if the input stream is {@code null}.
490     * @throws IOException              thrown if there is a problem reading the stream.
491     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
492     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
493     */
494    @Deprecated
495    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
496    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
497        this.defaultEncoding = defaultEncoding;
498        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
499                false, BOMS);
500        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
501        this.encoding = processHttpStream(bom, pis, lenient);
502        this.reader = new InputStreamReader(pis, encoding);
503    }
504
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor delegates to {@link #XmlStreamReader(InputStream, String, boolean)} with {@code lenient} set to {@code true}.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
        this(inputStream, httpContentType, true);
    }
525
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * </p>
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * </p>
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else 'UTF-8' is used.
     * </p>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
     * </p>
     * <p>
     * This constructor delegates to {@link #XmlStreamReader(InputStream, String, boolean, String)} with a {@code null} default encoding.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient         indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }
563
564    /**
565     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
566     * <p>
567     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
568     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
569     * </p>
570     * <p>
571     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
572     * </p>
573     * <p>
574     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
575     * </p>
576     * <p>
577     * Else if the XML prolog had a charset encoding that encoding is used.
578     * </p>
579     * <p>
580     * Else if the content type had a charset encoding that encoding is used.
581     * </p>
582     * <p>
583     * Else 'UTF-8' is used.
584     * </p>
585     * <p>
586     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
587     * </p>
588     *
589     * @param inputStream     InputStream to create the reader from.
590     * @param httpContentType content-type header to use for the resolution of the charset encoding.
591     * @param lenient         indicates if the charset encoding detection should be relaxed.
592     * @param defaultEncoding The default encoding
593     * @throws NullPointerException     if the input stream is {@code null}.
594     * @throws IOException              thrown if there is a problem reading the file.
595     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
596     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
597     */
598    @Deprecated
599    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
600    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
601            throws IOException {
602        this.defaultEncoding = defaultEncoding;
603        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
604                false, BOMS);
605        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
606        this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
607        this.reader = new InputStreamReader(pis, encoding);
608    }
609
    /**
     * Constructs a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor opens the file with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and delegates to
     * {@link #XmlStreamReader(InputStream)}.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @since 2.11.0
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
    public XmlStreamReader(final Path file) throws IOException {
        // NOTE(review): if the delegated constructor throws, nothing here closes the stream just opened - confirm whether callers can hit this.
        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
    }
630
    /**
     * Constructs a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
     * </p>
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor opens a connection to the URL and delegates to {@link #XmlStreamReader(URLConnection, String)} with a {@code null} default encoding.
     * </p>
     *
     * @param url URL to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream of the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }
650
651    /**
652     * Constructs a Reader using the InputStream of a URLConnection.
653     * <p>
654     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
655     * </p>
656     * <p>
657     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
658     * content-type.
659     * </p>
660     * <p>
661     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
662     * </p>
663     *
664     * @param urlConnection   URLConnection to create a Reader from.
665     * @param defaultEncoding The default encoding
666     * @throws NullPointerException if the input is {@code null}.
667     * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
668     */
669    public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
670        Objects.requireNonNull(urlConnection, "urlConnection");
671        this.defaultEncoding = defaultEncoding;
672        final boolean lenient = true;
673        final String contentType = urlConnection.getContentType();
674        final InputStream inputStream = urlConnection.getInputStream();
675        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
676        // @formatter:off
677        final BOMInputStream bomInput = BOMInputStream.builder()
678            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
679            .setInclude(false)
680            .setByteOrderMarks(BOMS)
681            .get();
682        @SuppressWarnings("resource")
683        final BOMInputStream piInput = BOMInputStream.builder()
684            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
685            .setInclude(true)
686            .setByteOrderMarks(XML_GUESS_BYTES)
687            .get();
688        // @formatter:on
689        if (urlConnection instanceof HttpURLConnection || contentType != null) {
690            this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
691        } else {
692            this.encoding = processHttpStream(bomInput, piInput, lenient);
693        }
694        this.reader = new InputStreamReader(piInput, encoding);
695    }
696
697    /**
698     * Calculates the HTTP encoding.
699     * @param bomEnc          BOM encoding
700     * @param xmlGuessEnc     XML Guess encoding
701     * @param xmlEnc          XML encoding
702     * @param lenient         indicates if the charset encoding detection should be relaxed.
703     * @param httpContentType The HTTP content type
704     * @return the HTTP encoding
705     * @throws IOException thrown if there is a problem reading the stream.
706     */
707    String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
708            throws IOException {
709
710        // Lenient and has XML encoding
711        if (lenient && xmlEnc != null) {
712            return xmlEnc;
713        }
714
715        // Determine mime/encoding content types from HTTP Content Type
716        final String cTMime = getContentTypeMime(httpContentType);
717        final String cTEnc = getContentTypeEncoding(httpContentType);
718        final boolean appXml = isAppXml(cTMime);
719        final boolean textXml = isTextXml(cTMime);
720
721        // Mime type NOT "application/xml" or "text/xml"
722        if (!appXml && !textXml) {
723            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
724            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
725        }
726
727        // No content type encoding
728        if (cTEnc == null) {
729            if (appXml) {
730                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
731            }
732            return defaultEncoding == null ? US_ASCII : defaultEncoding;
733        }
734
735        // UTF-16BE or UTF-16LE content type encoding
736        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
737            if (bomEnc != null) {
738                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
739                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
740            }
741            return cTEnc;
742        }
743
744        // UTF-16 content type encoding
745        if (cTEnc.equals(UTF_16)) {
746            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
747                return bomEnc;
748            }
749            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
750            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
751        }
752
753        // UTF-32BE or UTF-132E content type encoding
754        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
755            if (bomEnc != null) {
756                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
757                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
758            }
759            return cTEnc;
760        }
761
762        // UTF-32 content type encoding
763        if (cTEnc.equals(UTF_32)) {
764            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
765                return bomEnc;
766            }
767            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
768            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
769        }
770
771        return cTEnc;
772    }
773
774    /**
775     * Calculate the raw encoding.
776     *
777     * @param bomEnc      BOM encoding
778     * @param xmlGuessEnc XML Guess encoding
779     * @param xmlEnc      XML encoding
780     * @return the raw encoding
781     * @throws IOException thrown if there is a problem reading the stream.
782     */
783    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
784
785        // BOM is Null
786        if (bomEnc == null) {
787            if (xmlGuessEnc == null || xmlEnc == null) {
788                return defaultEncoding == null ? UTF_8 : defaultEncoding;
789            }
790            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
791                return xmlGuessEnc;
792            }
793            return xmlEnc;
794        }
795
796        // BOM is UTF-8
797        if (bomEnc.equals(UTF_8)) {
798            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
799                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
800                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
801            }
802            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
803                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
804                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
805            }
806            return bomEnc;
807        }
808
809        // BOM is UTF-16BE or UTF-16LE
810        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
811            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
812                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
813                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
814            }
815            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
816                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
817                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
818            }
819            return bomEnc;
820        }
821
822        // BOM is UTF-32BE or UTF-32LE
823        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
824            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
825                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
826                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
827            }
828            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
829                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
830                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
831            }
832            return bomEnc;
833        }
834
835        // BOM is something else
836        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
837        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
838    }
839
840    /**
841     * Closes the XmlStreamReader stream.
842     *
843     * @throws IOException thrown if there was a problem closing the stream.
844     */
845    @Override
846    public void close() throws IOException {
847        reader.close();
848    }
849
850    /**
851     * Does lenient detection.
852     *
853     * @param httpContentType content-type header to use for the resolution of the charset encoding.
854     * @param ex              The thrown exception
855     * @return the encoding
856     * @throws IOException thrown if there is a problem reading the stream.
857     */
858    private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
859        if (httpContentType != null && httpContentType.startsWith("text/html")) {
860            httpContentType = httpContentType.substring("text/html".length());
861            httpContentType = "text/xml" + httpContentType;
862            try {
863                return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
864            } catch (final XmlStreamReaderException ex2) {
865                ex = ex2;
866            }
867        }
868        String encoding = ex.getXmlEncoding();
869        if (encoding == null) {
870            encoding = ex.getContentTypeEncoding();
871        }
872        if (encoding == null) {
873            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
874        }
875        return encoding;
876    }
877
878    /**
879     * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
880     * <p>
881     * If it is {@code null} the content-type based rules are used.
882     * </p>
883     *
884     * @return the default encoding to use.
885     */
886    public String getDefaultEncoding() {
887        return defaultEncoding;
888    }
889
890    /**
891     * Gets the charset encoding of the XmlStreamReader.
892     *
893     * @return charset encoding.
894     */
895    public String getEncoding() {
896        return encoding;
897    }
898
899    /**
900     * Process the raw stream.
901     *
902     * @param bomInput     BOMInputStream to detect byte order marks
903     * @param piInput     BOMInputStream to guess XML encoding
904     * @param lenient indicates if the charset encoding detection should be relaxed.
905     * @return the encoding to be used
906     * @throws IOException thrown if there is a problem reading the stream.
907     */
908    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
909        final String bomEnc = bomInput.getBOMCharsetName();
910        final String xmlGuessEnc = piInput.getBOMCharsetName();
911        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
912        try {
913            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
914        } catch (final XmlStreamReaderException ex) {
915            if (lenient) {
916                return doLenientDetection(null, ex);
917            }
918            throw ex;
919        }
920    }
921
922    /**
923     * Processes an HTTP stream.
924     *
925     * @param bomInput        BOMInputStream to detect byte order marks
926     * @param piInput         BOMInputStream to guess XML encoding
927     * @param lenient         indicates if the charset encoding detection should be relaxed.
928     * @param httpContentType The HTTP content type
929     * @return the encoding to be used
930     * @throws IOException thrown if there is a problem reading the stream.
931     */
932    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
933            throws IOException {
934        final String bomEnc = bomInput.getBOMCharsetName();
935        final String xmlGuessEnc = piInput.getBOMCharsetName();
936        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
937        try {
938            return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
939        } catch (final XmlStreamReaderException ex) {
940            if (lenient) {
941                return doLenientDetection(httpContentType, ex);
942            }
943            throw ex;
944        }
945    }
946
947    /**
948     * Reads the underlying reader's {@code read(char[], int, int)} method.
949     *
950     * @param buf    the buffer to read the characters into
951     * @param offset The start offset
952     * @param len    The number of bytes to read
953     * @return the number of characters read or -1 if the end of stream
954     * @throws IOException if an I/O error occurs.
955     */
956    @Override
957    public int read(final char[] buf, final int offset, final int len) throws IOException {
958        return reader.read(buf, offset, len);
959    }
960
961}