001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.charset.Charset;
031import java.nio.charset.StandardCharsets;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.text.MessageFormat;
035import java.util.Locale;
036import java.util.Objects;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.Charsets;
042import org.apache.commons.io.IOUtils;
043import org.apache.commons.io.build.AbstractStreamBuilder;
044import org.apache.commons.io.function.IOConsumer;
045import org.apache.commons.io.output.XmlStreamWriter;
046
047/**
048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
049 * <p>
050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
051 * </p>
052 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not
 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
055 * </p>
056 * <p>
057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
058 * </p>
059 * <p>
 * By default the charset encoding detection is lenient; the lenient flag can be turned off to get strict detection (following the HTTP MIME and XML
 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
063 * </p>
064 * <p>
065 * To build an instance, use {@link Builder}.
066 * </p>
067 * <p>
068 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
069 * </p>
070 *
071 * @see Builder
072 * @see org.apache.commons.io.output.XmlStreamWriter
073 * @since 2.0
074 */
075public class XmlStreamReader extends Reader {
076
077    // @formatter:off
078    /**
079     * Builds a new {@link XmlStreamWriter}.
080     *
081     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
082     * <p>
083     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
084     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
085     * </p>
086     * <p>
087     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
088     * </p>
089     * <p>
090     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
091     * </p>
092     * <p>
093     * Else if the XML prolog had a charset encoding that encoding is used.
094     * </p>
095     * <p>
096     * Else if the content type had a charset encoding that encoding is used.
097     * </p>
098     * <p>
099     * Else 'UTF-8' is used.
100     * </p>
101     * <p>
102     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
103     * </p>
104     * <p>
105     * For example:
106     * </p>
107     *
108     * <pre>{@code
109     * XmlStreamReader r = XmlStreamReader.builder()
110     *   .setPath(path)
111     *   .setCharset(StandardCharsets.UTF_8)
112     *   .get();
113     * }
114     * </pre>
115     *
116     * @see #get()
117     * @since 2.12.0
118     */
119    // @formatter:on
120    public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
121
122        private boolean nullCharset = true;
123        private boolean lenient = true;
124        private String httpContentType;
125
126        /**
127         * Builds a new {@link XmlStreamWriter}.
128         * <p>
129         * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
130         * </p>
131         * <p>
132         * This builder use the following aspects:
133         * </p>
134         * <ul>
135         * <li>{@link #getInputStream()}</li>
136         * <li>{@link #getCharset()}</li>
137         * <li>lenient</li>
138         * <li>httpContentType</li>
139         * </ul>
140         *
141         * @return a new instance.
142         * @throws IllegalStateException         if the {@code origin} is {@code null}.
143         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
144         * @throws IOException                   if an I/O error occurs.
145         * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
146         * @see #getInputStream()
147         */
148        @SuppressWarnings("resource")
149        @Override
150        public XmlStreamReader get() throws IOException {
151            final String defaultEncoding = nullCharset ? null : getCharset().name();
152            // @formatter:off
153            return httpContentType == null
154                    ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
155                    : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
156            // @formatter:on
157        }
158
159        @Override
160        public Builder setCharset(final Charset charset) {
161            nullCharset = charset == null;
162            return super.setCharset(charset);
163        }
164
165        @Override
166        public Builder setCharset(final String charset) {
167            nullCharset = charset == null;
168            return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
169        }
170
171        /**
172         * Sets the HTTP content type.
173         *
174         * @param httpContentType the HTTP content type.
175         * @return this.
176         */
177        public Builder setHttpContentType(final String httpContentType) {
178            this.httpContentType = httpContentType;
179            return this;
180        }
181
182        /**
183         * Sets the lenient toggle.
184         *
185         * @param lenient the lenient toggle.
186         * @return this.
187         */
188        public Builder setLenient(final boolean lenient) {
189            this.lenient = lenient;
190            return this;
191        }
192
193    }
194
    /** Canonical name of the UTF-8 charset. */
    private static final String UTF_8 = StandardCharsets.UTF_8.name();

    /** Canonical name of the US-ASCII charset. */
    private static final String US_ASCII = StandardCharsets.US_ASCII.name();

    /** Canonical name of the UTF-16BE charset. */
    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

    /** Canonical name of the UTF-16LE charset. */
    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

    /** Name of the UTF-32BE charset; a literal because {@link StandardCharsets} has no UTF-32 constants. */
    private static final String UTF_32BE = "UTF-32BE";

    /** Name of the UTF-32LE charset; a literal because {@link StandardCharsets} has no UTF-32 constants. */
    private static final String UTF_32LE = "UTF-32LE";

    /** Canonical name of the UTF-16 charset (no explicit byte order). */
    private static final String UTF_16 = StandardCharsets.UTF_16.name();

    /** Name of the UTF-32 charset (no explicit byte order). */
    private static final String UTF_32 = "UTF-32";

    /** Charset name used for the EBCDIC guess. */
    private static final String EBCDIC = "CP1047";

    /** Byte order marks looked for at the start of the stream by the first (excluding) BOMInputStream the constructors create. */
    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE };

    /**
     * Pseudo "byte order marks" used to guess the encoding from the first bytes of the XML declaration: each entry is the byte sequence of the leading
     * characters of {@code <?xml} ({@code <?xm}, or just {@code <?} for UTF-16) in the named charset.
     * <p>
     * UTF_16LE and UTF_32LE have the same two starting BOM bytes.
     * </p>
     */
    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

    /**
     * Captures the value of a {@code charset=} parameter (optionally quoted with {@code "} or {@code '}) in group 1; the value stops at the first
     * {@code ;}, space or quote. The match is case-sensitive on the parameter name.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
224
225    /**
226     * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
227     * <p>
228     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
229     * </p>
230     * <p>
231     * Note the documented pattern is:
232     * </p>
233     * <pre>
234     * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
235     * </pre>
236     * <p>
237     * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
238     * {@code 'ebcdic-de-273+euro'}.
239     * </p>
240     */
241    public static final Pattern ENCODING_PATTERN = Pattern.compile(
242    // @formatter:off
243            "^<\\?xml\\s+"
244            + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
245            + "encoding\\s*=\\s*"
246            + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
247            +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
248            Pattern.MULTILINE);
249    // @formatter:on
250
    // MessageFormat templates for XmlStreamReaderException messages. The placeholders are filled positionally with the values named in each template.

    /** Raw (no content type) detection: BOM, XML guess and XML prolog encodings disagree. */
    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    /** Raw (no content type) detection: the BOM encoding is not one that is handled. */
    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /** HTTP detection: a BOM is present where none is allowed. */
    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

    /** HTTP detection: the detected encodings disagree. */
    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    /** HTTP detection: the content-type MIME is neither in the application XML nor in the text XML family. */
    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
260
    /**
     * Constructs a new {@link Builder} with its default settings; configure it and call {@link Builder#get()} to obtain an {@link XmlStreamReader}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
270
271    /**
272     * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
273     *
274     * @param httpContentType the HTTP content type
275     * @return The content type encoding (upcased)
276     */
277    static String getContentTypeEncoding(final String httpContentType) {
278        String encoding = null;
279        if (httpContentType != null) {
280            final int i = httpContentType.indexOf(";");
281            if (i > -1) {
282                final String postMime = httpContentType.substring(i + 1);
283                final Matcher m = CHARSET_PATTERN.matcher(postMime);
284                encoding = m.find() ? m.group(1) : null;
285                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
286            }
287        }
288        return encoding;
289    }
290
291    /**
292     * Gets the MIME type or {@code null} if httpContentType is {@code null}.
293     *
294     * @param httpContentType the HTTP content type
295     * @return The mime content type
296     */
297    static String getContentTypeMime(final String httpContentType) {
298        String mime = null;
299        if (httpContentType != null) {
300            final int i = httpContentType.indexOf(";");
301            mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
302            mime = mime.trim();
303        }
304        return mime;
305    }
306
    /**
     * Gets the encoding declared in the XML declaration ({@code <?xml ... encoding="..."?>}), {@code null} if none.
     * <p>
     * The stream is marked, up to {@link IOUtils#DEFAULT_BUFFER_SIZE} bytes are read and decoded with {@code guessedEnc} until the first {@code '>'} is
     * seen, and the stream is then reset so that no bytes are consumed from the caller's point of view. The stream must therefore support mark/reset.
     * </p>
     *
     * @param inputStream InputStream to create the reader from.
     * @param guessedEnc  guessed encoding used to decode the leading bytes; when {@code null} nothing is read and {@code null} is returned.
     * @return the encoding declared in the XML declaration, upper-cased and without its quotes, or {@code null} if there is no such declaration.
     * @throws IOException thrown if there is a problem reading the stream, if the stream ends before a {@code '>'} is found, or if no {@code '>'} is found
     *                     within the bytes that were read.
     */
    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            // NOTE(review): the buffer is presumably IOUtils.DEFAULT_BUFFER_SIZE bytes long, matching the read limit below - confirm.
            final byte[] bytes = IOUtils.byteArray();
            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
            int offset = 0; // number of bytes accumulated in 'bytes' so far
            int max = IOUtils.DEFAULT_BUFFER_SIZE; // remaining number of bytes that may still be read
            int c = inputStream.read(bytes, offset, max);
            int firstGT = -1;
            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
            // Keep reading until end of stream, until a '>' has been decoded, or until the byte budget is used up.
            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
                offset += c;
                max -= c;
                // The next chunk is requested before the bytes already read are inspected; 'c' therefore describes the read that follows them.
                c = inputStream.read(bytes, offset, max);
                xmlProlog = new String(bytes, 0, offset, guessedEnc);
                firstGT = xmlProlog.indexOf('>');
            }
            if (firstGT == -1) {
                // No '>' found: distinguish a stream that ended early (this includes an empty stream) from an exhausted byte budget.
                if (c == -1) {
                    throw new IOException("Unexpected end of XML stream");
                }
                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
            }
            final int bytesRead = offset;
            if (bytesRead > 0) {
                // Rewind to the mark so the caller sees the stream from its original position.
                inputStream.reset();
                // Only the text up to and including the first '>' is examined.
                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
                final StringBuilder prolog = new StringBuilder();
                // Join the lines with single spaces so a declaration spread over several lines can be matched as one line.
                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
                final Matcher m = ENCODING_PATTERN.matcher(prolog);
                if (m.find()) {
                    encoding = m.group(1).toUpperCase(Locale.ROOT);
                    // Group 1 includes the surrounding quotes; strip them.
                    encoding = encoding.substring(1, encoding.length() - 1);
                }
            }
        }
        return encoding;
    }
353
354    /**
355     * Tests if the MIME type belongs to the APPLICATION XML family.
356     *
357     * @param mime The mime type
358     * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
359     */
360    static boolean isAppXml(final String mime) {
361        return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
362                || mime.startsWith("application/") && mime.endsWith("+xml"));
363    }
364
365    /**
366     * Tests if the MIME type belongs to the TEXT XML family.
367     *
368     * @param mime The mime type
369     * @return true if the mime type belongs to the TEXT XML family, otherwise false
370     */
371    static boolean isTextXml(final String mime) {
372        return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
373    }
374
    /** The underlying character stream; the constructors assign an {@link InputStreamReader} that decodes with {@link #encoding}. */
    private final Reader reader;

    /** The charset encoding name determined during construction and used to create {@link #reader}. */
    private final String encoding;

    /** The default encoding supplied by the caller at construction time; may be {@code null}. */
    private final String defaultEncoding;
380
    /**
     * Constructs a Reader for a File.
     * <p>
     * The file is converted with {@link File#toPath()} and handed to {@link #XmlStreamReader(Path)}.
     * </p>
     * <p>
     * It looks for a BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final File file) throws IOException {
        this(Objects.requireNonNull(file, "file").toPath());
    }
399
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * It does a lenient charset encoding detection: this constructor delegates to {@link #XmlStreamReader(InputStream, boolean)} with {@code lenient} set to
     * {@code true}; check that constructor for details.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }
418
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files. This constructor delegates to {@link #XmlStreamReader(InputStream, boolean, String)} with a {@code null}
     * default encoding.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications, it then attempts the following:
     * </p>
     * <ol>
     * <li>If the content type was 'text/html', it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ol>
     * <p>
     * If lenient detection is indicated, an XmlStreamReaderException is never thrown.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient     indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        this(inputStream, lenient, null);
    }
454
455    /**
456     * Constructs a Reader for a raw InputStream.
457     * <p>
458     * It follows the same logic used for files.
459     * </p>
460     * <p>
461     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
462     * </p>
463     * <p>
464     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
465     * </p>
466     * <p>
467     * Else if the XML prolog had a charset encoding that encoding is used.
468     * </p>
469     * <p>
470     * Else if the content type had a charset encoding that encoding is used.
471     * </p>
472     * <p>
473     * Else 'UTF-8' is used.
474     * </p>
475     * <p>
476     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
477     * </p>
478     *
479     * @param inputStream     InputStream to create a Reader from.
480     * @param lenient         indicates if the charset encoding detection should be relaxed.
481     * @param defaultEncoding The default encoding
482     * @throws NullPointerException     if the input stream is {@code null}.
483     * @throws IOException              thrown if there is a problem reading the stream.
484     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
485     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
486     */
487    @Deprecated
488    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
489    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
490        this.defaultEncoding = defaultEncoding;
491        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
492                false, BOMS);
493        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
494        this.encoding = processHttpStream(bom, pis, lenient);
495        this.reader = new InputStreamReader(pis, encoding);
496    }
497
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the
     * XML prolog encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection: this constructor delegates to {@link #XmlStreamReader(InputStream, String, boolean)} with
     * {@code lenient} set to {@code true}; check that constructor for details.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
        this(inputStream, httpContentType, true);
    }
518
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * This constructor delegates to {@link #XmlStreamReader(InputStream, String, boolean, String)} with a {@code null} default encoding.
     * </p>
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the
     * XML prolog encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications, it then attempts the following:
     * </p>
     * <ol>
     * <li>If the content type was 'text/html', it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ol>
     * <p>
     * If lenient detection is indicated, an XmlStreamReaderException is never thrown.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient         indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }
556
557    /**
558     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
559     * <p>
560     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
561     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
562     * </p>
563     * <p>
564     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
565     * </p>
566     * <p>
567     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
568     * </p>
569     * <p>
570     * Else if the XML prolog had a charset encoding that encoding is used.
571     * </p>
572     * <p>
573     * Else if the content type had a charset encoding that encoding is used.
574     * </p>
575     * <p>
576     * Else 'UTF-8' is used.
577     * </p>
578     * <p>
579     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
580     * </p>
581     *
582     * @param inputStream     InputStream to create the reader from.
583     * @param httpContentType content-type header to use for the resolution of the charset encoding.
584     * @param lenient         indicates if the charset encoding detection should be relaxed.
585     * @param defaultEncoding The default encoding
586     * @throws NullPointerException     if the input stream is {@code null}.
587     * @throws IOException              thrown if there is a problem reading the file.
588     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
589     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
590     */
591    @Deprecated
592    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
593    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
594            throws IOException {
595        this.defaultEncoding = defaultEncoding;
596        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
597                false, BOMS);
598        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
599        this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
600        this.reader = new InputStreamReader(pis, encoding);
601    }
602
    /**
     * Constructs a Reader for a File.
     * <p>
     * The file is opened with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and handed to {@link #XmlStreamReader(InputStream)}.
     * </p>
     * <p>
     * It looks for a BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @since 2.11.0
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
    public XmlStreamReader(final Path file) throws IOException {
        // NOTE(review): if the delegated constructor throws, the stream opened here appears to stay open - confirm whether callers rely on this.
        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
    }
623
    /**
     * Constructs a Reader using the InputStream of a URL.
     * <p>
     * The connection is obtained with {@link URL#openConnection()} and handed to {@link #XmlStreamReader(URLConnection, String)} with a {@code null} default
     * encoding.
     * </p>
     * <p>
     * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data, it uses the same logic used for Files.
     * </p>
     * <p>
     * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data, it uses the same logic used for an InputStream with content-type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param url URL to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream of the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }
643
644    /**
645     * Constructs a Reader using the InputStream of a URLConnection.
646     * <p>
647     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
648     * </p>
649     * <p>
650     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
651     * content-type.
652     * </p>
653     * <p>
654     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
655     * </p>
656     *
657     * @param urlConnection   URLConnection to create a Reader from.
658     * @param defaultEncoding The default encoding
659     * @throws NullPointerException if the input is {@code null}.
660     * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
661     */
662    public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
663        Objects.requireNonNull(urlConnection, "urlConnection");
664        this.defaultEncoding = defaultEncoding;
665        final boolean lenient = true;
666        final String contentType = urlConnection.getContentType();
667        final InputStream inputStream = urlConnection.getInputStream();
668        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
669        // @formatter:off
670        final BOMInputStream bomInput = BOMInputStream.builder()
671            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
672            .setInclude(false)
673            .setByteOrderMarks(BOMS)
674            .get();
675        @SuppressWarnings("resource")
676        final BOMInputStream piInput = BOMInputStream.builder()
677            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
678            .setInclude(true)
679            .setByteOrderMarks(XML_GUESS_BYTES)
680            .get();
681        // @formatter:on
682        if (urlConnection instanceof HttpURLConnection || contentType != null) {
683            this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
684        } else {
685            this.encoding = processHttpStream(bomInput, piInput, lenient);
686        }
687        this.reader = new InputStreamReader(piInput, encoding);
688    }
689
690    /**
691     * Calculates the HTTP encoding.
692     * @param bomEnc          BOM encoding
693     * @param xmlGuessEnc     XML Guess encoding
694     * @param xmlEnc          XML encoding
695     * @param lenient         indicates if the charset encoding detection should be relaxed.
696     * @param httpContentType The HTTP content type
697     *
698     * @return the HTTP encoding
699     * @throws IOException thrown if there is a problem reading the stream.
700     */
701    String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
702            throws IOException {
703
704        // Lenient and has XML encoding
705        if (lenient && xmlEnc != null) {
706            return xmlEnc;
707        }
708
709        // Determine mime/encoding content types from HTTP Content Type
710        final String cTMime = getContentTypeMime(httpContentType);
711        final String cTEnc = getContentTypeEncoding(httpContentType);
712        final boolean appXml = isAppXml(cTMime);
713        final boolean textXml = isTextXml(cTMime);
714
715        // Mime type NOT "application/xml" or "text/xml"
716        if (!appXml && !textXml) {
717            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719        }
720
721        // No content type encoding
722        if (cTEnc == null) {
723            if (appXml) {
724                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
725            }
726            return defaultEncoding == null ? US_ASCII : defaultEncoding;
727        }
728
729        // UTF-16BE or UTF-16LE content type encoding
730        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
731            if (bomEnc != null) {
732                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
733                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
734            }
735            return cTEnc;
736        }
737
738        // UTF-16 content type encoding
739        if (cTEnc.equals(UTF_16)) {
740            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
741                return bomEnc;
742            }
743            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
744            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
745        }
746
747        // UTF-32BE or UTF-132E content type encoding
748        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
749            if (bomEnc != null) {
750                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
751                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
752            }
753            return cTEnc;
754        }
755
756        // UTF-32 content type encoding
757        if (cTEnc.equals(UTF_32)) {
758            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
759                return bomEnc;
760            }
761            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
762            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
763        }
764
765        return cTEnc;
766    }
767
768    /**
769     * Calculate the raw encoding.
770     *
771     * @param bomEnc      BOM encoding
772     * @param xmlGuessEnc XML Guess encoding
773     * @param xmlEnc      XML encoding
774     * @return the raw encoding
775     * @throws IOException thrown if there is a problem reading the stream.
776     */
777    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
778
779        // BOM is Null
780        if (bomEnc == null) {
781            if (xmlGuessEnc == null || xmlEnc == null) {
782                return defaultEncoding == null ? UTF_8 : defaultEncoding;
783            }
784            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
785                return xmlGuessEnc;
786            }
787            return xmlEnc;
788        }
789
790        // BOM is UTF-8
791        if (bomEnc.equals(UTF_8)) {
792            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
793                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
794                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
795            }
796            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
797                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
798                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
799            }
800            return bomEnc;
801        }
802
803        // BOM is UTF-16BE or UTF-16LE
804        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
805            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
806                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
807                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
808            }
809            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
810                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
811                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
812            }
813            return bomEnc;
814        }
815
816        // BOM is UTF-32BE or UTF-32LE
817        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
818            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
819                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
820                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
821            }
822            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
823                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
824                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
825            }
826            return bomEnc;
827        }
828
829        // BOM is something else
830        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
831        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
832    }
833
834    /**
835     * Closes the XmlStreamReader stream.
836     *
837     * @throws IOException thrown if there was a problem closing the stream.
838     */
839    @Override
840    public void close() throws IOException {
841        reader.close();
842    }
843
844    /**
845     * Does lenient detection.
846     *
847     * @param httpContentType content-type header to use for the resolution of the charset encoding.
848     * @param ex              The thrown exception
849     * @return the encoding
850     * @throws IOException thrown if there is a problem reading the stream.
851     */
852    private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
853        if (httpContentType != null && httpContentType.startsWith("text/html")) {
854            httpContentType = httpContentType.substring("text/html".length());
855            httpContentType = "text/xml" + httpContentType;
856            try {
857                return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
858            } catch (final XmlStreamReaderException ex2) {
859                ex = ex2;
860            }
861        }
862        String encoding = ex.getXmlEncoding();
863        if (encoding == null) {
864            encoding = ex.getContentTypeEncoding();
865        }
866        if (encoding == null) {
867            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
868        }
869        return encoding;
870    }
871
872    /**
873     * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
874     * <p>
875     * If it is {@code null} the content-type based rules are used.
876     * </p>
877     *
878     * @return the default encoding to use.
879     */
880    public String getDefaultEncoding() {
881        return defaultEncoding;
882    }
883
884    /**
885     * Gets the charset encoding of the XmlStreamReader.
886     *
887     * @return charset encoding.
888     */
889    public String getEncoding() {
890        return encoding;
891    }
892
893    /**
894     * Process the raw stream.
895     *
896     * @param bomInput     BOMInputStream to detect byte order marks
897     * @param piInput     BOMInputStream to guess XML encoding
898     * @param lenient indicates if the charset encoding detection should be relaxed.
899     * @return the encoding to be used
900     * @throws IOException thrown if there is a problem reading the stream.
901     */
902    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
903        final String bomEnc = bomInput.getBOMCharsetName();
904        final String xmlGuessEnc = piInput.getBOMCharsetName();
905        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
906        try {
907            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
908        } catch (final XmlStreamReaderException ex) {
909            if (lenient) {
910                return doLenientDetection(null, ex);
911            }
912            throw ex;
913        }
914    }
915
916    /**
917     * Processes an HTTP stream.
918     *
919     * @param bomInput        BOMInputStream to detect byte order marks
920     * @param piInput         BOMInputStream to guess XML encoding
921     * @param lenient         indicates if the charset encoding detection should be relaxed.
922     * @param httpContentType The HTTP content type
923     * @return the encoding to be used
924     * @throws IOException thrown if there is a problem reading the stream.
925     */
926    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
927            throws IOException {
928        final String bomEnc = bomInput.getBOMCharsetName();
929        final String xmlGuessEnc = piInput.getBOMCharsetName();
930        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
931        try {
932            return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
933        } catch (final XmlStreamReaderException ex) {
934            if (lenient) {
935                return doLenientDetection(httpContentType, ex);
936            }
937            throw ex;
938        }
939    }
940
941    /**
942     * Reads the underlying reader's {@code read(char[], int, int)} method.
943     *
944     * @param buf    the buffer to read the characters into
945     * @param offset The start offset
946     * @param len    The number of bytes to read
947     * @return the number of characters read or -1 if the end of stream
948     * @throws IOException if an I/O error occurs.
949     */
950    @Override
951    public int read(final char[] buf, final int offset, final int len) throws IOException {
952        return reader.read(buf, offset, len);
953    }
954
955}