XmlStreamReader.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.io.input;

  18. import java.io.BufferedInputStream;
  19. import java.io.BufferedReader;
  20. import java.io.File;
  21. import java.io.IOException;
  22. import java.io.InputStream;
  23. import java.io.InputStreamReader;
  24. import java.io.Reader;
  25. import java.io.StringReader;
  26. import java.net.HttpURLConnection;
  27. import java.net.URL;
  28. import java.net.URLConnection;
  29. import java.nio.charset.Charset;
  30. import java.nio.charset.StandardCharsets;
  31. import java.nio.file.Files;
  32. import java.nio.file.Path;
  33. import java.text.MessageFormat;
  34. import java.util.Locale;
  35. import java.util.Objects;
  36. import java.util.regex.Matcher;
  37. import java.util.regex.Pattern;

  38. import org.apache.commons.io.ByteOrderMark;
  39. import org.apache.commons.io.Charsets;
  40. import org.apache.commons.io.IOUtils;
  41. import org.apache.commons.io.build.AbstractStreamBuilder;
  42. import org.apache.commons.io.function.IOConsumer;
  43. import org.apache.commons.io.output.XmlStreamWriter;

  44. /**
  45.  * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
  46.  * <p>
  47.  * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
  48.  * </p>
  49.  * <p>
  50.  * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as valid XML. This is not 100%
  51.  * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
  52.  * </p>
  53.  * <p>
  54.  * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
  55.  * </p>
  56.  * <p>
  57.  * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
  58.  * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
  59.  * Determining the character encoding of a feed</a>.
  60.  * </p>
  61.  * <p>
  62.  * To build an instance, use {@link Builder}.
  63.  * </p>
  64.  * <p>
  65.  * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
  66.  * </p>
  67.  *
  68.  * @see Builder
  69.  * @see org.apache.commons.io.output.XmlStreamWriter
  70.  * @since 2.0
  71.  */
  72. public class XmlStreamReader extends Reader {

  73.     // @formatter:off
  74.     /**
  75.      * Builds a new {@link XmlStreamWriter}.
  76.      *
  77.      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
  78.      * <p>
  79.      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
  80.      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
  81.      * </p>
  82.      * <p>
  83.      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
  84.      * </p>
  85.      * <p>
  86.      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
  87.      * </p>
  88.      * <p>
  89.      * Else if the XML prolog had a charset encoding that encoding is used.
  90.      * </p>
  91.      * <p>
  92.      * Else if the content type had a charset encoding that encoding is used.
  93.      * </p>
  94.      * <p>
  95.      * Else 'UTF-8' is used.
  96.      * </p>
  97.      * <p>
  98.      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
  99.      * </p>
  100.      * <p>
  101.      * For example:
  102.      * </p>
  103.      *
  104.      * <pre>{@code
  105.      * XmlStreamReader r = XmlStreamReader.builder()
  106.      *   .setPath(path)
  107.      *   .setCharset(StandardCharsets.UTF_8)
  108.      *   .get();
  109.      * }
  110.      * </pre>
  111.      *
  112.      * @see #get()
  113.      * @since 2.12.0
  114.      */
  115.     // @formatter:on
  116.     public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {

  117.         private boolean nullCharset = true;
  118.         private boolean lenient = true;
  119.         private String httpContentType;

  120.         /**
  121.          * Constructs a new builder of {@link XmlStreamReader}.
  122.          */
  123.         public Builder() {
  124.             // empty
  125.         }

  126.         /**
  127.          * Builds a new {@link XmlStreamWriter}.
  128.          * <p>
  129.          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
  130.          * </p>
  131.          * <p>
  132.          * This builder uses the following aspects:
  133.          * </p>
  134.          * <ul>
  135.          * <li>{@link #getInputStream()}</li>
  136.          * <li>{@link #getCharset()}</li>
  137.          * <li>lenient</li>
  138.          * <li>httpContentType</li>
  139.          * </ul>
  140.          *
  141.          * @return a new instance.
  142.          * @throws IllegalStateException         if the {@code origin} is {@code null}.
  143.          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
  144.          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
  145.          * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
  146.          * @see #getInputStream()
  147.          * @see #getUnchecked()
  148.          */
  149.         @Override
  150.         public XmlStreamReader get() throws IOException {
  151.             final String defaultEncoding = nullCharset ? null : getCharset().name();
  152.             // @formatter:off
  153.             return httpContentType == null
  154.                     ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
  155.                     : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
  156.             // @formatter:on
  157.         }

  158.         @Override
  159.         public Builder setCharset(final Charset charset) {
  160.             nullCharset = charset == null;
  161.             return super.setCharset(charset);
  162.         }

  163.         @Override
  164.         public Builder setCharset(final String charset) {
  165.             nullCharset = charset == null;
  166.             return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
  167.         }

  168.         /**
  169.          * Sets the HTTP content type.
  170.          *
  171.          * @param httpContentType the HTTP content type.
  172.          * @return {@code this} instance.
  173.          */
  174.         public Builder setHttpContentType(final String httpContentType) {
  175.             this.httpContentType = httpContentType;
  176.             return this;
  177.         }

  178.         /**
  179.          * Sets the lenient toggle.
  180.          *
  181.          * @param lenient the lenient toggle.
  182.          * @return {@code this} instance.
  183.          */
  184.         public Builder setLenient(final boolean lenient) {
  185.             this.lenient = lenient;
  186.             return this;
  187.         }

  188.     }

  189.     private static final String UTF_8 = StandardCharsets.UTF_8.name();

  190.     private static final String US_ASCII = StandardCharsets.US_ASCII.name();

  191.     private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

  192.     private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

  193.     private static final String UTF_32BE = "UTF-32BE";

  194.     private static final String UTF_32LE = "UTF-32LE";

  195.     private static final String UTF_16 = StandardCharsets.UTF_16.name();

  196.     private static final String UTF_32 = "UTF-32";

  197.     private static final String EBCDIC = "CP1047";

  198.     private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
  199.             ByteOrderMark.UTF_32LE };

  200.     /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. */
  201.     private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
  202.             new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
  203.             new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
  204.             new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
  205.             new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

  206.     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");

  207.     /**
  208.      * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
  209.      * <p>
  210.      * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
  211.      * </p>
  212.      * <p>
  213.      * Note the documented pattern is:
  214.      * </p>
  215.      * <pre>
  216.      * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
  217.      * </pre>
  218.      * <p>
  219.      * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
  220.      * {@code 'ebcdic-de-273+euro'}.
  221.      * </p>
  222.      */
  223.     public static final Pattern ENCODING_PATTERN = Pattern.compile(
  224.     // @formatter:off
  225.             "^<\\?xml\\s+"
  226.             + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
  227.             + "encoding\\s*=\\s*"
  228.             + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
  229.             +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
  230.             Pattern.MULTILINE);
  231.     // @formatter:on

  232.     private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

  233.     private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

  234.     private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

  235.     private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

  236.     private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";

  237.     /**
  238.      * Constructs a new {@link Builder}.
  239.      *
  240.      * @return a new {@link Builder}.
  241.      * @since 2.12.0
  242.      */
  243.     public static Builder builder() {
  244.         return new Builder();
  245.     }

  246.     /**
  247.      * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
  248.      *
  249.      * @param httpContentType the HTTP content type
  250.      * @return The content type encoding (upcased)
  251.      */
  252.     static String getContentTypeEncoding(final String httpContentType) {
  253.         String encoding = null;
  254.         if (httpContentType != null) {
  255.             final int i = httpContentType.indexOf(";");
  256.             if (i > -1) {
  257.                 final String postMime = httpContentType.substring(i + 1);
  258.                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
  259.                 encoding = m.find() ? m.group(1) : null;
  260.                 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
  261.             }
  262.         }
  263.         return encoding;
  264.     }

  265.     /**
  266.      * Gets the MIME type or {@code null} if httpContentType is {@code null}.
  267.      *
  268.      * @param httpContentType the HTTP content type
  269.      * @return The mime content type
  270.      */
  271.     static String getContentTypeMime(final String httpContentType) {
  272.         String mime = null;
  273.         if (httpContentType != null) {
  274.             final int i = httpContentType.indexOf(";");
  275.             mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
  276.             mime = mime.trim();
  277.         }
  278.         return mime;
  279.     }

  280.     /**
  281.      * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
  282.      *
  283.      * @param inputStream InputStream to create the reader from.
  284.      * @param guessedEnc  guessed encoding
  285.      * @return the encoding declared in the <?xml encoding=...?>
  286.      * @throws IOException thrown if there is a problem reading the stream.
  287.      */
  288.     private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
  289.         String encoding = null;
  290.         if (guessedEnc != null) {
  291.             final byte[] bytes = IOUtils.byteArray();
  292.             inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
  293.             int offset = 0;
  294.             int max = IOUtils.DEFAULT_BUFFER_SIZE;
  295.             int c = inputStream.read(bytes, offset, max);
  296.             int firstGT = -1;
  297.             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
  298.             while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
  299.                 offset += c;
  300.                 max -= c;
  301.                 c = inputStream.read(bytes, offset, max);
  302.                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
  303.                 firstGT = xmlProlog.indexOf('>');
  304.             }
  305.             if (firstGT == -1) {
  306.                 if (c == -1) {
  307.                     throw new IOException("Unexpected end of XML stream");
  308.                 }
  309.                 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
  310.             }
  311.             final int bytesRead = offset;
  312.             if (bytesRead > 0) {
  313.                 inputStream.reset();
  314.                 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
  315.                 final StringBuilder prolog = new StringBuilder();
  316.                 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
  317.                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
  318.                 if (m.find()) {
  319.                     encoding = m.group(1).toUpperCase(Locale.ROOT);
  320.                     encoding = encoding.substring(1, encoding.length() - 1);
  321.                 }
  322.             }
  323.         }
  324.         return encoding;
  325.     }

  326.     /**
  327.      * Tests if the MIME type belongs to the APPLICATION XML family.
  328.      *
  329.      * @param mime The mime type
  330.      * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
  331.      */
  332.     static boolean isAppXml(final String mime) {
  333.         return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
  334.                 || mime.startsWith("application/") && mime.endsWith("+xml"));
  335.     }

  336.     /**
  337.      * Tests if the MIME type belongs to the TEXT XML family.
  338.      *
  339.      * @param mime The mime type
  340.      * @return true if the mime type belongs to the TEXT XML family, otherwise false
  341.      */
  342.     static boolean isTextXml(final String mime) {
  343.         return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
  344.     }

  345.     private final Reader reader;

  346.     private final String encoding;

  347.     private final String defaultEncoding;

  348.     /**
  349.      * Constructs a Reader for a File.
  350.      * <p>
  351.      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
  352.      * </p>
  353.      * <p>
  354.      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
  355.      * </p>
  356.      *
  357.      * @param file File to create a Reader from.
  358.      * @throws NullPointerException if the input is {@code null}.
  359.      * @throws IOException          thrown if there is a problem reading the file.
  360.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  361.      */
  362.     @Deprecated
  363.     public XmlStreamReader(final File file) throws IOException {
  364.         this(Objects.requireNonNull(file, "file").toPath());
  365.     }

  366.     /**
  367.      * Constructs a Reader for a raw InputStream.
  368.      * <p>
  369.      * It follows the same logic used for files.
  370.      * </p>
  371.      * <p>
  372.      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
  373.      * </p>
  374.      *
  375.      * @param inputStream InputStream to create a Reader from.
  376.      * @throws NullPointerException if the input stream is {@code null}.
  377.      * @throws IOException          thrown if there is a problem reading the stream.
  378.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  379.      */
  380.     @Deprecated
  381.     public XmlStreamReader(final InputStream inputStream) throws IOException {
  382.         this(inputStream, true);
  383.     }

  384.     /**
  385.      * Constructs a Reader for a raw InputStream.
  386.      * <p>
  387.      * It follows the same logic used for files.
  388.      * </p>
  389.      * <p>
  390.      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
  391.      * </p>
  392.      * <p>
  393.      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
  394.      * </p>
  395.      * <p>
  396.      * Else if the XML prolog had a charset encoding that encoding is used.
  397.      * </p>
  398.      * <p>
  399.      * Else if the content type had a charset encoding that encoding is used.
  400.      * </p>
  401.      * <p>
  402.      * Else 'UTF-8' is used.
  403.      * </p>
  404.      * <p>
  405.      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
  406.      * </p>
  407.      *
  408.      * @param inputStream InputStream to create a Reader from.
  409.      * @param lenient     indicates if the charset encoding detection should be relaxed.
  410.      * @throws NullPointerException     if the input stream is {@code null}.
  411.      * @throws IOException              thrown if there is a problem reading the stream.
  412.      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
  413.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  414.      */
  415.     @Deprecated
  416.     public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
  417.         this(inputStream, lenient, null);
  418.     }

  419.     /**
  420.      * Constructs a Reader for a raw InputStream.
  421.      * <p>
  422.      * It follows the same logic used for files.
  423.      * </p>
  424.      * <p>
  425.      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
  426.      * </p>
  427.      * <p>
  428.      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
  429.      * </p>
  430.      * <p>
  431.      * Else if the XML prolog had a charset encoding that encoding is used.
  432.      * </p>
  433.      * <p>
  434.      * Else if the content type had a charset encoding that encoding is used.
  435.      * </p>
  436.      * <p>
  437.      * Else 'UTF-8' is used.
  438.      * </p>
  439.      * <p>
  440.      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
  441.      * </p>
  442.      *
  443.      * @param inputStream     InputStream to create a Reader from.
  444.      * @param lenient         indicates if the charset encoding detection should be relaxed.
  445.      * @param defaultEncoding The default encoding
  446.      * @throws NullPointerException     if the input stream is {@code null}.
  447.      * @throws IOException              thrown if there is a problem reading the stream.
  448.      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
  449.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  450.      */
  451.     @Deprecated
  452.     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
  453.     public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
  454.         this.defaultEncoding = defaultEncoding;
  455.         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
  456.                 false, BOMS);
  457.         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
  458.         this.encoding = processHttpStream(bom, pis, lenient);
  459.         this.reader = new InputStreamReader(pis, encoding);
  460.     }

  461.     /**
  462.      * Constructs a Reader using an InputStream and the associated content-type header.
  463.      * <p>
  464.      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
  465.      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
  466.      * </p>
  467.      * <p>
  468.      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
  469.      * </p>
  470.      *
  471.      * @param inputStream     InputStream to create the reader from.
  472.      * @param httpContentType content-type header to use for the resolution of the charset encoding.
  473.      * @throws NullPointerException if the input stream is {@code null}.
  474.      * @throws IOException          thrown if there is a problem reading the file.
  475.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  476.      */
  477.     @Deprecated
  478.     public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
  479.         this(inputStream, httpContentType, true);
  480.     }

  481.     /**
  482.      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
  483.      * <p>
  484.      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
  485.      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
  486.      * </p>
  487.      * <p>
  488.      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
  489.      * </p>
  490.      * <p>
  491.      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
  492.      * </p>
  493.      * <p>
  494.      * Else if the XML prolog had a charset encoding that encoding is used.
  495.      * </p>
  496.      * <p>
  497.      * Else if the content type had a charset encoding that encoding is used.
  498.      * </p>
  499.      * <p>
  500.      * Else 'UTF-8' is used.
  501.      * </p>
  502.      * <p>
  503.      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
  504.      * </p>
  505.      *
  506.      * @param inputStream     InputStream to create the reader from.
  507.      * @param httpContentType content-type header to use for the resolution of the charset encoding.
  508.      * @param lenient         indicates if the charset encoding detection should be relaxed.
  509.      * @throws NullPointerException     if the input stream is {@code null}.
  510.      * @throws IOException              thrown if there is a problem reading the file.
  511.      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
  512.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  513.      */
  514.     @Deprecated
  515.     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
  516.         this(inputStream, httpContentType, lenient, null);
  517.     }

  518.     /**
  519.      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
  520.      * <p>
  521.      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
  522.      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
  523.      * </p>
  524.      * <p>
  525.      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
  526.      * </p>
  527.      * <p>
  528.      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
  529.      * </p>
  530.      * <p>
  531.      * Else if the XML prolog had a charset encoding that encoding is used.
  532.      * </p>
  533.      * <p>
  534.      * Else if the content type had a charset encoding that encoding is used.
  535.      * </p>
  536.      * <p>
  537.      * Else 'UTF-8' is used.
  538.      * </p>
  539.      * <p>
  540.      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
  541.      * </p>
  542.      *
  543.      * @param inputStream     InputStream to create the reader from.
  544.      * @param httpContentType content-type header to use for the resolution of the charset encoding.
  545.      * @param lenient         indicates if the charset encoding detection should be relaxed.
  546.      * @param defaultEncoding The default encoding
  547.      * @throws NullPointerException     if the input stream is {@code null}.
  548.      * @throws IOException              thrown if there is a problem reading the file.
  549.      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
  550.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  551.      */
  552.     @Deprecated
  553.     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
  554.     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
  555.             throws IOException {
  556.         this.defaultEncoding = defaultEncoding;
  557.         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
  558.                 false, BOMS);
  559.         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
  560.         this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
  561.         this.reader = new InputStreamReader(pis, encoding);
  562.     }

  563.     /**
  564.      * Constructs a Reader for a File.
  565.      * <p>
  566.      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
  567.      * </p>
  568.      * <p>
  569.      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
  570.      * </p>
  571.      *
  572.      * @param file File to create a Reader from.
  573.      * @throws NullPointerException if the input is {@code null}.
  574.      * @throws IOException          thrown if there is a problem reading the file.
  575.      * @since 2.11.0
  576.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  577.      */
  578.     @Deprecated
  579.     @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
  580.     public XmlStreamReader(final Path file) throws IOException {
  581.         this(Files.newInputStream(Objects.requireNonNull(file, "file")));
  582.     }

  /**
   * Constructs a Reader over the content a URL points to.
   * <p>
   * The URL is opened and handed, without a default encoding, to the {@link URLConnection} based constructor. As a result:
   * </p>
   * <ul>
   * <li>when the connection is not an HTTP connection and supplies no 'content-type' header, the same detection logic as for files applies;</li>
   * <li>when the connection is an HTTP connection or supplies a 'content-type' header, the same detection logic as for an InputStream with a content-type
   * applies.</li>
   * </ul>
   * <p>
   * Charset detection is lenient; see the constructor that takes a {@code lenient} parameter for details.
   * </p>
   *
   * @param url the URL to create a Reader from.
   * @throws NullPointerException if {@code url} is {@code null}.
   * @throws IOException          if the connection cannot be opened or its stream cannot be read.
   */
  public XmlStreamReader(final URL url) throws IOException {
      this(
          Objects.requireNonNull(url, "url").openConnection(),
          null);
  }

  /**
   * Constructs a Reader using the InputStream of a URLConnection.
   * <p>
   * If the URLConnection is not an {@link HttpURLConnection} and reports no content type, the encoding is derived from the BOM and the XML prolog only (the
   * same logic used for files).
   * </p>
   * <p>
   * If the URLConnection is an {@link HttpURLConnection} or reports a content type, the content type takes part in the detection (the same logic used for an
   * InputStream with content-type).
   * </p>
   * <p>
   * Charset detection is always lenient here; see the constructor that takes a {@code lenient} parameter for details.
   * </p>
   * <p>
   * If encoding detection or the creation of the underlying reader fails, the stream obtained from the connection is closed before the failure is
   * propagated, so a failed construction does not leak the connection's stream.
   * </p>
   *
   * @param urlConnection   URLConnection to create a Reader from.
   * @param defaultEncoding The default encoding, may be {@code null}.
   * @throws NullPointerException if {@code urlConnection} is {@code null}.
   * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
   */
  public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
      Objects.requireNonNull(urlConnection, "urlConnection");
      this.defaultEncoding = defaultEncoding;
      final boolean lenient = true;
      final String contentType = urlConnection.getContentType();
      final InputStream inputStream = urlConnection.getInputStream();
      try {
          // @formatter:off
          @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
          final BOMInputStream bomInput = BOMInputStream.builder()
              .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
              .setInclude(false)
              .setByteOrderMarks(BOMS)
              .get();
          @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
          final BOMInputStream piInput = BOMInputStream.builder()
              .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
              .setInclude(true)
              .setByteOrderMarks(XML_GUESS_BYTES)
              .get();
          // @formatter:on
          if (urlConnection instanceof HttpURLConnection || contentType != null) {
              // HTTP(-like) source: the content type participates in the detection.
              this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
          } else {
              // No content type available: rely on BOM and XML prolog only.
              this.encoding = processHttpStream(bomInput, piInput, lenient);
          }
          this.reader = new InputStreamReader(piInput, encoding);
      } catch (final IOException | RuntimeException e) {
          // Nobody else holds the stream at this point; release it so a failed construction does not leak it.
          try {
              inputStream.close();
          } catch (final IOException closeFailure) {
              e.addSuppressed(closeFailure);
          }
          throw e;
      }
  }

  647.     /**
  648.      * Calculates the HTTP encoding.
  649.      * @param bomEnc          BOM encoding
  650.      * @param xmlGuessEnc     XML Guess encoding
  651.      * @param xmlEnc          XML encoding
  652.      * @param lenient         indicates if the charset encoding detection should be relaxed.
  653.      * @param httpContentType The HTTP content type
  654.      * @return the HTTP encoding
  655.      * @throws IOException thrown if there is a problem reading the stream.
  656.      */
  657.     String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
  658.             throws IOException {

  659.         // Lenient and has XML encoding
  660.         if (lenient && xmlEnc != null) {
  661.             return xmlEnc;
  662.         }

  663.         // Determine mime/encoding content types from HTTP Content Type
  664.         final String cTMime = getContentTypeMime(httpContentType);
  665.         final String cTEnc = getContentTypeEncoding(httpContentType);
  666.         final boolean appXml = isAppXml(cTMime);
  667.         final boolean textXml = isTextXml(cTMime);

  668.         // Mime type NOT "application/xml" or "text/xml"
  669.         if (!appXml && !textXml) {
  670.             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  671.             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  672.         }

  673.         // No content type encoding
  674.         if (cTEnc == null) {
  675.             if (appXml) {
  676.                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
  677.             }
  678.             return defaultEncoding == null ? US_ASCII : defaultEncoding;
  679.         }

  680.         // UTF-16BE or UTF-16LE content type encoding
  681.         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
  682.             if (bomEnc != null) {
  683.                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  684.                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  685.             }
  686.             return cTEnc;
  687.         }

  688.         // UTF-16 content type encoding
  689.         if (cTEnc.equals(UTF_16)) {
  690.             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
  691.                 return bomEnc;
  692.             }
  693.             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  694.             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  695.         }

  696.         // UTF-32BE or UTF-132E content type encoding
  697.         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
  698.             if (bomEnc != null) {
  699.                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  700.                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  701.             }
  702.             return cTEnc;
  703.         }

  704.         // UTF-32 content type encoding
  705.         if (cTEnc.equals(UTF_32)) {
  706.             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
  707.                 return bomEnc;
  708.             }
  709.             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  710.             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
  711.         }

  712.         return cTEnc;
  713.     }

  714.     /**
  715.      * Calculate the raw encoding.
  716.      *
  717.      * @param bomEnc      BOM encoding
  718.      * @param xmlGuessEnc XML Guess encoding
  719.      * @param xmlEnc      XML encoding
  720.      * @return the raw encoding
  721.      * @throws IOException thrown if there is a problem reading the stream.
  722.      */
  723.     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {

  724.         // BOM is Null
  725.         if (bomEnc == null) {
  726.             if (xmlGuessEnc == null || xmlEnc == null) {
  727.                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
  728.             }
  729.             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
  730.                 return xmlGuessEnc;
  731.             }
  732.             return xmlEnc;
  733.         }

  734.         // BOM is UTF-8
  735.         if (bomEnc.equals(UTF_8)) {
  736.             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
  737.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  738.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  739.             }
  740.             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
  741.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  742.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  743.             }
  744.             return bomEnc;
  745.         }

  746.         // BOM is UTF-16BE or UTF-16LE
  747.         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
  748.             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
  749.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  750.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  751.             }
  752.             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
  753.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  754.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  755.             }
  756.             return bomEnc;
  757.         }

  758.         // BOM is UTF-32BE or UTF-32LE
  759.         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
  760.             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
  761.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  762.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  763.             }
  764.             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
  765.                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
  766.                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  767.             }
  768.             return bomEnc;
  769.         }

  770.         // BOM is something else
  771.         final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
  772.         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
  773.     }

  774.     /**
  775.      * Closes the XmlStreamReader stream.
  776.      *
  777.      * @throws IOException thrown if there was a problem closing the stream.
  778.      */
  779.     @Override
  780.     public void close() throws IOException {
  781.         reader.close();
  782.     }

  783.     /**
  784.      * Does lenient detection.
  785.      *
  786.      * @param httpContentType content-type header to use for the resolution of the charset encoding.
  787.      * @param ex              The thrown exception
  788.      * @return the encoding
  789.      * @throws IOException thrown if there is a problem reading the stream.
  790.      */
  791.     private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
  792.         if (httpContentType != null && httpContentType.startsWith("text/html")) {
  793.             httpContentType = httpContentType.substring("text/html".length());
  794.             httpContentType = "text/xml" + httpContentType;
  795.             try {
  796.                 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
  797.             } catch (final XmlStreamReaderException ex2) {
  798.                 ex = ex2;
  799.             }
  800.         }
  801.         String encoding = ex.getXmlEncoding();
  802.         if (encoding == null) {
  803.             encoding = ex.getContentTypeEncoding();
  804.         }
  805.         if (encoding == null) {
  806.             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
  807.         }
  808.         return encoding;
  809.     }

  810.     /**
  811.      * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
  812.      * <p>
  813.      * If it is {@code null} the content-type based rules are used.
  814.      * </p>
  815.      *
  816.      * @return the default encoding to use.
  817.      */
  818.     public String getDefaultEncoding() {
  819.         return defaultEncoding;
  820.     }

  821.     /**
  822.      * Gets the charset encoding of the XmlStreamReader.
  823.      *
  824.      * @return charset encoding.
  825.      */
  826.     public String getEncoding() {
  827.         return encoding;
  828.     }

  829.     /**
  830.      * Process the raw stream.
  831.      *
  832.      * @param bomInput     BOMInputStream to detect byte order marks
  833.      * @param piInput     BOMInputStream to guess XML encoding
  834.      * @param lenient indicates if the charset encoding detection should be relaxed.
  835.      * @return the encoding to be used
  836.      * @throws IOException thrown if there is a problem reading the stream.
  837.      */
  838.     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
  839.         final String bomEnc = bomInput.getBOMCharsetName();
  840.         final String xmlGuessEnc = piInput.getBOMCharsetName();
  841.         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
  842.         try {
  843.             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
  844.         } catch (final XmlStreamReaderException ex) {
  845.             if (lenient) {
  846.                 return doLenientDetection(null, ex);
  847.             }
  848.             throw ex;
  849.         }
  850.     }

  851.     /**
  852.      * Processes an HTTP stream.
  853.      *
  854.      * @param bomInput        BOMInputStream to detect byte order marks
  855.      * @param piInput         BOMInputStream to guess XML encoding
  856.      * @param lenient         indicates if the charset encoding detection should be relaxed.
  857.      * @param httpContentType The HTTP content type
  858.      * @return the encoding to be used
  859.      * @throws IOException thrown if there is a problem reading the stream.
  860.      */
  861.     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
  862.             throws IOException {
  863.         final String bomEnc = bomInput.getBOMCharsetName();
  864.         final String xmlGuessEnc = piInput.getBOMCharsetName();
  865.         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
  866.         try {
  867.             return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
  868.         } catch (final XmlStreamReaderException ex) {
  869.             if (lenient) {
  870.                 return doLenientDetection(httpContentType, ex);
  871.             }
  872.             throw ex;
  873.         }
  874.     }

  875.     /**
  876.      * Reads the underlying reader's {@code read(char[], int, int)} method.
  877.      *
  878.      * @param buf    the buffer to read the characters into
  879.      * @param offset The start offset
  880.      * @param len    The number of bytes to read
  881.      * @return the number of characters read or -1 if the end of stream
  882.      * @throws IOException if an I/O error occurs.
  883.      */
  884.     @Override
  885.     public int read(final char[] buf, final int offset, final int len) throws IOException {
  886.         return reader.read(buf, offset, len);
  887.     }

  888. }