Coverage Report - org.apache.commons.io.input.XmlStreamReader
 
Classes in this File Line Coverage Branch Coverage Complexity
XmlStreamReader
88%
170/193
92%
148/160
6
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.io.input;
 18  
 
 19  
 import java.io.BufferedInputStream;
 20  
 import java.io.BufferedReader;
 21  
 import java.io.File;
 22  
 import java.io.FileInputStream;
 23  
 import java.io.IOException;
 24  
 import java.io.InputStream;
 25  
 import java.io.InputStreamReader;
 26  
 import java.io.Reader;
 27  
 import java.io.StringReader;
 28  
 import java.net.HttpURLConnection;
 29  
 import java.net.URL;
 30  
 import java.net.URLConnection;
 31  
 import java.text.MessageFormat;
 32  
 import java.util.Locale;
 33  
 import java.util.regex.Matcher;
 34  
 import java.util.regex.Pattern;
 35  
 
 36  
 import org.apache.commons.io.ByteOrderMark;
 37  
 
 38  
 /**
 39  
  * Character stream that handles all the necessary voodoo to figure out the
 40  
  * charset encoding of the XML document within the stream.
 41  
  * <p>
 42  
  * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 43  
  * This one IS a character stream.
 44  
  * <p>
 45  
  * All this has to be done without consuming characters from the stream, if not
 46  
  * the XML parser will not recognize the document as valid XML. This is not
 47  
  * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
 48  
  * right now, XmlStreamReader handles it and things work in all parsers).
 49  
  * <p>
 50  
  * The XmlStreamReader class handles the charset encoding of XML documents in
 51  
  * Files, raw streams and HTTP streams by offering a wide set of constructors.
 52  
  * <p>
 53  
  * By default the charset encoding detection is lenient, the constructor with
 54  
  * the lenient flag can be used for strict detection (following HTTP MIME and XML
 55  
  * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
 56  
  * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 57  
  * Determining the character encoding of a feed</a>.
 58  
  * <p>
 59  
  * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 60  
  * Apache License 2.0.
 61  
  *
 62  
  * @version $Id: XmlStreamReader.java 1686747 2015-06-21 18:44:49Z krosenvold $
 63  
  * @see org.apache.commons.io.output.XmlStreamWriter
 64  
  * @since 2.0
 65  
  */
 66  
 public class XmlStreamReader extends Reader {
 67  
     private static final int BUFFER_SIZE = 4096;
 68  
 
 69  
     private static final String UTF_8 = "UTF-8";
 70  
 
 71  
     private static final String US_ASCII = "US-ASCII";
 72  
 
 73  
     private static final String UTF_16BE = "UTF-16BE";
 74  
 
 75  
     private static final String UTF_16LE = "UTF-16LE";
 76  
 
 77  
     private static final String UTF_32BE = "UTF-32BE";
 78  
 
 79  
     private static final String UTF_32LE = "UTF-32LE";
 80  
 
 81  
     private static final String UTF_16 = "UTF-16";
 82  
 
 83  
     private static final String UTF_32 = "UTF-32";
 84  
 
 85  
     private static final String EBCDIC = "CP1047";
 86  
 
 87  5
     private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
 88  
         ByteOrderMark.UTF_8,
 89  
         ByteOrderMark.UTF_16BE,
 90  
         ByteOrderMark.UTF_16LE,
 91  
         ByteOrderMark.UTF_32BE,
 92  
         ByteOrderMark.UTF_32LE
 93  
     };
 94  
 
 95  
     // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
 96  5
     private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
 97  
         new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
 98  
         new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
 99  
         new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
 100  
         new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
 101  
                 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
 102  
         new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
 103  
                 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
 104  
         new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
 105  
     };
 106  
 
 107  
     private final Reader reader;
 108  
 
 109  
     private final String encoding;
 110  
 
 111  
     private final String defaultEncoding;
 112  
 
 113  
     /**
 114  
      * Returns the default encoding to use if none is set in HTTP content-type,
 115  
      * XML prolog and the rules based on content-type are not adequate.
 116  
      * <p>
 117  
      * If it is NULL the content-type based rules are used.
 118  
      *
 119  
      * @return the default encoding to use.
 120  
      */
 121  
     public String getDefaultEncoding() {
 122  0
         return defaultEncoding;
 123  
     }
 124  
 
 125  
     /**
 126  
      * Creates a Reader for a File.
 127  
      * <p>
 128  
      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
 129  
      * if this is also missing defaults to UTF-8.
 130  
      * <p>
 131  
      * It does a lenient charset encoding detection, check the constructor with
 132  
      * the lenient parameter for details.
 133  
      *
 134  
      * @param file File to create a Reader from.
 135  
      * @throws IOException thrown if there is a problem reading the file.
 136  
      */
 137  
     public XmlStreamReader(final File file) throws IOException {
 138  0
         this(new FileInputStream(file));
 139  0
     }
 140  
 
 141  
     /**
 142  
      * Creates a Reader for a raw InputStream.
 143  
      * <p>
 144  
      * It follows the same logic used for files.
 145  
      * <p>
 146  
      * It does a lenient charset encoding detection, check the constructor with
 147  
      * the lenient parameter for details.
 148  
      *
 149  
      * @param is InputStream to create a Reader from.
 150  
      * @throws IOException thrown if there is a problem reading the stream.
 151  
      */
 152  
     public XmlStreamReader(final InputStream is) throws IOException {
 153  35
         this(is, true);
 154  35
     }
 155  
 
 156  
     /**
 157  
      * Creates a Reader for a raw InputStream.
 158  
      * <p>
 159  
      * It follows the same logic used for files.
 160  
      * <p>
 161  
      * If lenient detection is indicated and the detection above fails as per
 162  
      * specifications it then attempts the following:
 163  
      * <p>
 164  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 165  
      * tries the detection again.
 166  
      * <p>
 167  
      * Else if the XML prolog had a charset encoding that encoding is used.
 168  
      * <p>
 169  
      * Else if the content type had a charset encoding that encoding is used.
 170  
      * <p>
 171  
      * Else 'UTF-8' is used.
 172  
      * <p>
 173  
      * If lenient detection is indicated an XmlStreamReaderException is never
 174  
      * thrown.
 175  
      *
 176  
      * @param is InputStream to create a Reader from.
 177  
      * @param lenient indicates if the charset encoding detection should be
 178  
      *        relaxed.
 179  
      * @throws IOException thrown if there is a problem reading the stream.
 180  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 181  
      *         be determined according to the specs.
 182  
      */
 183  
     public XmlStreamReader(final InputStream is, final boolean lenient) throws IOException {
 184  67
         this(is, lenient, null);
 185  50
     }
 186  
 
 187  
     /**
 188  
      * Creates a Reader for a raw InputStream.
 189  
      * <p>
 190  
      * It follows the same logic used for files.
 191  
      * <p>
 192  
      * If lenient detection is indicated and the detection above fails as per
 193  
      * specifications it then attempts the following:
 194  
      * <p>
 195  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 196  
      * tries the detection again.
 197  
      * <p>
 198  
      * Else if the XML prolog had a charset encoding that encoding is used.
 199  
      * <p>
 200  
      * Else if the content type had a charset encoding that encoding is used.
 201  
      * <p>
 202  
      * Else 'UTF-8' is used.
 203  
      * <p>
 204  
      * If lenient detection is indicated an XmlStreamReaderException is never
 205  
      * thrown.
 206  
      *
 207  
      * @param is InputStream to create a Reader from.
 208  
      * @param lenient indicates if the charset encoding detection should be
 209  
      *        relaxed.
 210  
      * @param defaultEncoding The default encoding
 211  
      * @throws IOException thrown if there is a problem reading the stream.
 212  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 213  
      *         be determined according to the specs.
 214  
      */
 215  
     public XmlStreamReader(final InputStream is, final boolean lenient, final String defaultEncoding)
 216  67
             throws IOException {
 217  67
         this.defaultEncoding = defaultEncoding;
 218  67
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 219  67
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 220  67
         this.encoding = doRawStream(bom, pis, lenient);
 221  50
         this.reader = new InputStreamReader(pis, encoding);
 222  50
     }
 223  
 
 224  
     /**
 225  
      * Creates a Reader using the InputStream of a URL.
 226  
      * <p>
 227  
      * If the URL is not of type HTTP and there is not 'content-type' header in
 228  
      * the fetched data it uses the same logic used for Files.
 229  
      * <p>
 230  
      * If the URL is a HTTP Url or there is a 'content-type' header in the
 231  
      * fetched data it uses the same logic used for an InputStream with
 232  
      * content-type.
 233  
      * <p>
 234  
      * It does a lenient charset encoding detection, check the constructor with
 235  
      * the lenient parameter for details.
 236  
      *
 237  
      * @param url URL to create a Reader from.
 238  
      * @throws IOException thrown if there is a problem reading the stream of
 239  
      *         the URL.
 240  
      */
 241  
     public XmlStreamReader(final URL url) throws IOException {
 242  0
         this(url.openConnection(), null);
 243  0
     }
 244  
 
 245  
     /**
 246  
      * Creates a Reader using the InputStream of a URLConnection.
 247  
      * <p>
 248  
      * If the URLConnection is not of type HttpURLConnection and there is not
 249  
      * 'content-type' header in the fetched data it uses the same logic used for
 250  
      * files.
 251  
      * <p>
 252  
      * If the URLConnection is a HTTP Url or there is a 'content-type' header in
 253  
      * the fetched data it uses the same logic used for an InputStream with
 254  
      * content-type.
 255  
      * <p>
 256  
      * It does a lenient charset encoding detection, check the constructor with
 257  
      * the lenient parameter for details.
 258  
      *
 259  
      * @param conn URLConnection to create a Reader from.
 260  
      * @param defaultEncoding The default encoding
 261  
      * @throws IOException thrown if there is a problem reading the stream of
 262  
      *         the URLConnection.
 263  
      */
 264  0
     public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
 265  0
         this.defaultEncoding = defaultEncoding;
 266  0
         final boolean lenient = true;
 267  0
         final String contentType = conn.getContentType();
 268  0
         final InputStream is = conn.getInputStream();
 269  0
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 270  0
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 271  0
         if (conn instanceof HttpURLConnection || contentType != null) {
 272  0
             this.encoding = doHttpStream(bom, pis, contentType, lenient);
 273  
         } else {
 274  0
             this.encoding = doRawStream(bom, pis, lenient);
 275  
         }
 276  0
         this.reader = new InputStreamReader(pis, encoding);
 277  0
     }
 278  
 
 279  
     /**
 280  
      * Creates a Reader using an InputStream and the associated content-type
 281  
      * header.
 282  
      * <p>
 283  
      * First it checks if the stream has BOM. If there is not BOM checks the
 284  
      * content-type encoding. If there is not content-type encoding checks the
 285  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 286  
      * encoding mandated by the content-type MIME type.
 287  
      * <p>
 288  
      * It does a lenient charset encoding detection, check the constructor with
 289  
      * the lenient parameter for details.
 290  
      *
 291  
      * @param is InputStream to create the reader from.
 292  
      * @param httpContentType content-type header to use for the resolution of
 293  
      *        the charset encoding.
 294  
      * @throws IOException thrown if there is a problem reading the file.
 295  
      */
 296  
     public XmlStreamReader(final InputStream is, final String httpContentType)
 297  
             throws IOException {
 298  1
         this(is, httpContentType, true);
 299  1
     }
 300  
 
 301  
     /**
 302  
      * Creates a Reader using an InputStream and the associated content-type
 303  
      * header. This constructor is lenient regarding the encoding detection.
 304  
      * <p>
 305  
      * First it checks if the stream has BOM. If there is not BOM checks the
 306  
      * content-type encoding. If there is not content-type encoding checks the
 307  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 308  
      * encoding mandated by the content-type MIME type.
 309  
      * <p>
 310  
      * If lenient detection is indicated and the detection above fails as per
 311  
      * specifications it then attempts the following:
 312  
      * <p>
 313  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 314  
      * tries the detection again.
 315  
      * <p>
 316  
      * Else if the XML prolog had a charset encoding that encoding is used.
 317  
      * <p>
 318  
      * Else if the content type had a charset encoding that encoding is used.
 319  
      * <p>
 320  
      * Else 'UTF-8' is used.
 321  
      * <p>
 322  
      * If lenient detection is indicated an XmlStreamReaderException is never
 323  
      * thrown.
 324  
      *
 325  
      * @param is InputStream to create the reader from.
 326  
      * @param httpContentType content-type header to use for the resolution of
 327  
      *        the charset encoding.
 328  
      * @param lenient indicates if the charset encoding detection should be
 329  
      *        relaxed.
 330  
      * @param defaultEncoding The default encoding
 331  
      * @throws IOException thrown if there is a problem reading the file.
 332  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 333  
      *         be determined according to the specs.
 334  
      */
 335  
     public XmlStreamReader(final InputStream is, final String httpContentType,
 336  184
             final boolean lenient, final String defaultEncoding) throws IOException {
 337  184
         this.defaultEncoding = defaultEncoding;
 338  184
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 339  184
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 340  184
         this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
 341  163
         this.reader = new InputStreamReader(pis, encoding);
 342  163
     }
 343  
 
 344  
     /**
 345  
      * Creates a Reader using an InputStream and the associated content-type
 346  
      * header. This constructor is lenient regarding the encoding detection.
 347  
      * <p>
 348  
      * First it checks if the stream has BOM. If there is not BOM checks the
 349  
      * content-type encoding. If there is not content-type encoding checks the
 350  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 351  
      * encoding mandated by the content-type MIME type.
 352  
      * <p>
 353  
      * If lenient detection is indicated and the detection above fails as per
 354  
      * specifications it then attempts the following:
 355  
      * <p>
 356  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 357  
      * tries the detection again.
 358  
      * <p>
 359  
      * Else if the XML prolog had a charset encoding that encoding is used.
 360  
      * <p>
 361  
      * Else if the content type had a charset encoding that encoding is used.
 362  
      * <p>
 363  
      * Else 'UTF-8' is used.
 364  
      * <p>
 365  
      * If lenient detection is indicated an XmlStreamReaderException is never
 366  
      * thrown.
 367  
      *
 368  
      * @param is InputStream to create the reader from.
 369  
      * @param httpContentType content-type header to use for the resolution of
 370  
      *        the charset encoding.
 371  
      * @param lenient indicates if the charset encoding detection should be
 372  
      *        relaxed.
 373  
      * @throws IOException thrown if there is a problem reading the file.
 374  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 375  
      *         be determined according to the specs.
 376  
      */
 377  
     public XmlStreamReader(final InputStream is, final String httpContentType,
 378  
             final boolean lenient) throws IOException {
 379  67
         this(is, httpContentType, lenient, null);
 380  46
     }
 381  
 
 382  
     /**
 383  
      * Returns the charset encoding of the XmlStreamReader.
 384  
      *
 385  
      * @return charset encoding.
 386  
      */
 387  
     public String getEncoding() {
 388  100
         return encoding;
 389  
     }
 390  
 
 391  
     /**
 392  
      * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
 393  
      * @param buf the buffer to read the characters into
 394  
      * @param offset The start offset
 395  
      * @param len The maximum number of characters to read
 396  
      * @return the number of characters read or -1 if the end of stream
 397  
      * @throws IOException if an I/O error occurs
 398  
      */
 399  
     @Override
 400  
     public int read(final char[] buf, final int offset, final int len) throws IOException {
 401  8
         return reader.read(buf, offset, len);
 402  
     }
 403  
 
 404  
     /**
 405  
      * Closes the XmlStreamReader stream.
 406  
      *
 407  
      * @throws IOException thrown if there was a problem closing the stream.
 408  
      */
 409  
     @Override
 410  
     public void close() throws IOException {
 411  163
         reader.close();
 412  163
     }
 413  
 
 414  
     /**
 415  
      * Process the raw stream.
 416  
      *
 417  
      * @param bom BOMInputStream to detect byte order marks
 418  
      * @param pis BOMInputStream to guess XML encoding
 419  
      * @param lenient indicates if the charset encoding detection should be
 420  
      *        relaxed.
 421  
      * @return the encoding to be used
 422  
      * @throws IOException thrown if there is a problem reading the stream.
 423  
      */
 424  
     private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
 425  
             throws IOException {
 426  67
         final String bomEnc      = bom.getBOMCharsetName();
 427  67
         final String xmlGuessEnc = pis.getBOMCharsetName();
 428  67
         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
 429  
         try {
 430  67
             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
 431  17
         } catch (final XmlStreamReaderException ex) {
 432  17
             if (lenient) {
 433  0
                 return doLenientDetection(null, ex);
 434  
             } else {
 435  17
                 throw ex;
 436  
             }
 437  
         }
 438  
     }
 439  
 
 440  
     /**
 441  
      * Process a HTTP stream.
 442  
      *
 443  
      * @param bom BOMInputStream to detect byte order marks
 444  
      * @param pis BOMInputStream to guess XML encoding
 445  
      * @param httpContentType The HTTP content type
 446  
      * @param lenient indicates if the charset encoding detection should be
 447  
      *        relaxed.
 448  
      * @return the encoding to be used
 449  
      * @throws IOException thrown if there is a problem reading the stream.
 450  
      */
 451  
     private String doHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
 452  
             final boolean lenient) throws IOException {
 453  184
         final String bomEnc      = bom.getBOMCharsetName();
 454  184
         final String xmlGuessEnc = pis.getBOMCharsetName();
 455  184
         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
 456  
         try {
 457  184
             return calculateHttpEncoding(httpContentType, bomEnc,
 458  
                     xmlGuessEnc, xmlEnc, lenient);
 459  138
         } catch (final XmlStreamReaderException ex) {
 460  138
             if (lenient) {
 461  117
                 return doLenientDetection(httpContentType, ex);
 462  
             } else {
 463  21
                 throw ex;
 464  
             }
 465  
         }
 466  
     }
 467  
 
 468  
     /**
 469  
      * Do lenient detection.
 470  
      *
 471  
      * @param httpContentType content-type header to use for the resolution of
 472  
      *        the charset encoding.
 473  
      * @param ex The thrown exception
 474  
      * @return the encoding
 475  
      * @throws IOException thrown if there is a problem reading the stream.
 476  
      */
 477  
     private String doLenientDetection(String httpContentType,
 478  
             XmlStreamReaderException ex) throws IOException {
 479  117
         if (httpContentType != null && httpContentType.startsWith("text/html")) {
 480  1
             httpContentType = httpContentType.substring("text/html".length());
 481  1
             httpContentType = "text/xml" + httpContentType;
 482  
             try {
 483  1
                 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
 484  
                         ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
 485  0
             } catch (final XmlStreamReaderException ex2) {
 486  0
                 ex = ex2;
 487  
             }
 488  
         }
 489  116
         String encoding = ex.getXmlEncoding();
 490  116
         if (encoding == null) {
 491  116
             encoding = ex.getContentTypeEncoding();
 492  
         }
 493  116
         if (encoding == null) {
 494  112
             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
 495  
         }
 496  116
         return encoding;
 497  
     }
 498  
 
 499  
     /**
 500  
      * Calculate the raw encoding.
 501  
      *
 502  
      * @param bomEnc BOM encoding
 503  
      * @param xmlGuessEnc XML Guess encoding
 504  
      * @param xmlEnc XML encoding
 505  
      * @return the raw encoding
 506  
      * @throws IOException thrown if there is a problem reading the stream.
 507  
      */
 508  
     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
 509  
             final String xmlEnc) throws IOException {
 510  
 
 511  
         // BOM is Null
 512  156
         if (bomEnc == null) {
 513  56
             if (xmlGuessEnc == null || xmlEnc == null) {
 514  26
                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
 515  
             }
 516  30
             if (xmlEnc.equals(UTF_16) &&
 517  
                (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
 518  2
                 return xmlGuessEnc;
 519  
             }
 520  28
             return xmlEnc;
 521  
         }
 522  
 
 523  
         // BOM is UTF-8
 524  100
         if (bomEnc.equals(UTF_8)) {
 525  28
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
 526  4
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 527  4
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 528  
             }
 529  24
             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
 530  10
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 531  10
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 532  
             }
 533  14
             return bomEnc;
 534  
         }
 535  
 
 536  
         // BOM is UTF-16BE or UTF-16LE
 537  72
         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
 538  35
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
 539  6
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 540  6
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 541  
             }
 542  29
             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
 543  14
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 544  14
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 545  
             }
 546  15
             return bomEnc;
 547  
         }
 548  
 
 549  
         // BOM is UTF-32BE or UTF-32LE
 550  37
         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
 551  36
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
 552  6
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 553  6
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 554  
             }
 555  30
             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
 556  14
                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
 557  14
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 558  
             }
 559  16
             return bomEnc;
 560  
         }
 561  
 
 562  
         // BOM is something else
 563  1
         final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
 564  1
         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 565  
     }
 566  
 
 567  
 
 568  
     /**
 569  
      * Calculate the HTTP encoding.
 570  
      *
 571  
      * @param httpContentType The HTTP content type
 572  
      * @param bomEnc BOM encoding
 573  
      * @param xmlGuessEnc XML Guess encoding
 574  
      * @param xmlEnc XML encoding
 575  
      * @param lenient indicates if the charset encoding detection should be
 576  
      *        relaxed.
 577  
      * @return the HTTP encoding
 578  
      * @throws IOException thrown if there is a problem reading the stream.
 579  
      */
 580  
     String calculateHttpEncoding(final String httpContentType,
 581  
             final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
 582  
             final boolean lenient) throws IOException {
 583  
 
 584  
         // Lenient and has XML encoding
 585  217
         if (lenient && xmlEnc != null) {
 586  19
             return xmlEnc;
 587  
         }
 588  
 
 589  
         // Determine mime/encoding content types from HTTP Content Type
 590  198
         final String cTMime = getContentTypeMime(httpContentType);
 591  198
         final String cTEnc  = getContentTypeEncoding(httpContentType);
 592  198
         final boolean appXml  = isAppXml(cTMime);
 593  198
         final boolean textXml = isTextXml(cTMime);
 594  
 
 595  
         // Mime type NOT "application/xml" or "text/xml"
 596  198
         if (!appXml && !textXml) {
 597  116
             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 598  116
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 599  
         }
 600  
 
 601  
         // No content type encoding
 602  82
         if (cTEnc == null) {
 603  21
             if (appXml) {
 604  10
                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
 605  
             } else {
 606  11
                 return defaultEncoding == null ? US_ASCII : defaultEncoding;
 607  
             }
 608  
         }
 609  
 
 610  
         // UTF-16BE or UTF-16LE content type encoding
 611  61
         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
 612  11
             if (bomEnc != null) {
 613  9
                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 614  9
                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 615  
             }
 616  2
             return cTEnc;
 617  
         }
 618  
 
 619  
         // UTF-16 content type encoding
 620  50
         if (cTEnc.equals(UTF_16)) {
 621  16
             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
 622  9
                 return bomEnc;
 623  
             }
 624  7
             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 625  7
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 626  
         }
 627  
 
 628  
         // UTF-32BE or UTF-132E content type encoding
 629  34
         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
 630  11
             if (bomEnc != null) {
 631  9
                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 632  9
                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 633  
             }
 634  2
             return cTEnc;
 635  
         }
 636  
 
 637  
         // UTF-32 content type encoding
 638  23
         if (cTEnc.equals(UTF_32)) {
 639  13
             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
 640  6
                 return bomEnc;
 641  
             }
 642  7
             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 643  7
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 644  
         }
 645  
 
 646  10
         return cTEnc;
 647  
     }
 648  
 
 649  
     /**
 650  
      * Returns MIME type or NULL if httpContentType is NULL.
 651  
      *
 652  
      * @param httpContentType the HTTP content type
 653  
      * @return The mime content type
 654  
      */
 655  
     static String getContentTypeMime(final String httpContentType) {
 656  232
         String mime = null;
 657  232
         if (httpContentType != null) {
 658  113
             final int i = httpContentType.indexOf(";");
 659  113
             if (i >= 0) {
 660  83
                 mime = httpContentType.substring(0, i);
 661  
             } else {
 662  30
                 mime = httpContentType;
 663  
             }
 664  113
             mime = mime.trim();
 665  
         }
 666  232
         return mime;
 667  
     }
 668  
 
 669  5
     private static final Pattern CHARSET_PATTERN = Pattern
 670  
             .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
 671  
 
 672  
     /**
 673  
      * Returns charset parameter value, NULL if not present, NULL if
 674  
      * httpContentType is NULL.
 675  
      *
 676  
      * @param httpContentType the HTTP content type
 677  
      * @return The content type encoding (upcased)
 678  
      */
 679  
     static String getContentTypeEncoding(final String httpContentType) {
 680  250
         String encoding = null;
 681  250
         if (httpContentType != null) {
 682  131
             final int i = httpContentType.indexOf(";");
 683  131
             if (i > -1) {
 684  101
                 final String postMime = httpContentType.substring(i + 1);
 685  101
                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
 686  101
                 encoding = m.find() ? m.group(1) : null;
 687  101
                 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
 688  
             }
 689  
         }
 690  250
         return encoding;
 691  
     }
 692  
 
 693  5
     /**
      * Pattern that locates the {@code encoding} pseudo-attribute following a
      * literal {@code <?xml} in the text it is applied to.
      * <p>
      * Capturing group 1 is the declared value <em>including</em> its surrounding
      * single or double quotes; at least one character is required between the
      * quotes.
      * <p>
      * NOTE(review): the pattern contains no {@code ^} or {@code $} anchors, so the
      * {@link Pattern#MULTILINE} flag does not appear to change what it matches.
      */
     public static final Pattern ENCODING_PATTERN = Pattern.compile(
 694  
             "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
 695  
             Pattern.MULTILINE);
 696  
 
 697  
     /**
 698  
      * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
 699  
      *
 700  
      * @param is InputStream to create the reader from.
 701  
      * @param guessedEnc guessed encoding
 702  
      * @return the encoding declared in the <?xml encoding=...?>
 703  
      * @throws IOException thrown if there is a problem reading the stream.
 704  
      */
 705  
     private static String getXmlProlog(final InputStream is, final String guessedEnc)
 706  
             throws IOException {
 707  251
         String encoding = null;
 708  251
         if (guessedEnc != null) {
 709  116
             final byte[] bytes = new byte[BUFFER_SIZE];
 710  116
             is.mark(BUFFER_SIZE);
 711  116
             int offset = 0;
 712  116
             int max = BUFFER_SIZE;
 713  116
             int c = is.read(bytes, offset, max);
 714  116
             int firstGT = -1;
 715  116
             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
 716  232
             while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
 717  116
                 offset += c;
 718  116
                 max -= c;
 719  116
                 c = is.read(bytes, offset, max);
 720  116
                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
 721  116
                 firstGT = xmlProlog.indexOf('>');
 722  
             }
 723  116
             if (firstGT == -1) {
 724  0
                 if (c == -1) {
 725  0
                     throw new IOException("Unexpected end of XML stream");
 726  
                 } else {
 727  0
                     throw new IOException(
 728  
                             "XML prolog or ROOT element not found on first "
 729  
                                     + offset + " bytes");
 730  
                 }
 731  
             }
 732  116
             final int bytesRead = offset;
 733  116
             if (bytesRead > 0) {
 734  116
                 is.reset();
 735  116
                 final BufferedReader bReader = new BufferedReader(new StringReader(
 736  
                         xmlProlog.substring(0, firstGT + 1)));
 737  116
                 final StringBuffer prolog = new StringBuffer();
 738  116
                 String line = bReader.readLine();
 739  248
                 while (line != null) {
 740  132
                     prolog.append(line);
 741  132
                     line = bReader.readLine();
 742  
                 }
 743  116
                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
 744  116
                 if (m.find()) {
 745  91
                     encoding = m.group(1).toUpperCase();
 746  91
                     encoding = encoding.substring(1, encoding.length() - 1);
 747  
                 }
 748  
             }
 749  
         }
 750  251
         return encoding;
 751  
     }
 752  
 
 753  
     /**
 754  
      * Indicates if the MIME type belongs to the APPLICATION XML family.
 755  
      *
 756  
      * @param mime The mime type
 757  
      * @return true if the mime type belongs to the APPLICATION XML family,
 758  
      * otherwise false
 759  
      */
 760  
     static boolean isAppXml(final String mime) {
 761  222
         return mime != null &&
 762  
                (mime.equals("application/xml") ||
 763  
                 mime.equals("application/xml-dtd") ||
 764  
                 mime.equals("application/xml-external-parsed-entity") ||
 765  
                mime.startsWith("application/") && mime.endsWith("+xml"));
 766  
     }
 767  
 
 768  
     /**
 769  
      * Indicates if the MIME type belongs to the TEXT XML family.
 770  
      *
 771  
      * @param mime The mime type
 772  
      * @return true if the mime type belongs to the TEXT XML family,
 773  
      * otherwise false
 774  
      */
 775  
     static boolean isTextXml(final String mime) {
 776  216
         return mime != null &&
 777  
               (mime.equals("text/xml") ||
 778  
                mime.equals("text/xml-external-parsed-entity") ||
 779  
               mime.startsWith("text/") && mime.endsWith("+xml"));
 780  
     }
 781  
 
 782  
     /**
      * Message pattern reporting an encoding mismatch; placeholders are labelled
      * in the text as BOM {0}, XML guess {1} and XML prolog {2}.
      */
     private static final String RAW_EX_1 =
 783  
         "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
 784  
 
 785  
     /**
      * Message pattern reporting an unknown BOM; placeholders are labelled in the
      * text as BOM {0}, XML guess {1} and XML prolog {2}.
      */
     private static final String RAW_EX_2 =
 786  
         "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
 787  
 
 788  
     /**
      * Message pattern used when the content-type encoding is UTF-16BE/LE or
      * UTF-32BE/LE but a BOM is present. Formatted with the content-type MIME {0},
      * content-type encoding {1}, BOM encoding {2}, guessed XML encoding {3} and
      * XML prolog encoding {4}.
      */
     private static final String HTTP_EX_1 =
 789  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
 790  
 
 791  
     /**
      * Message pattern used when the content-type encoding is UTF-16 or UTF-32 and
      * the BOM encoding is missing or does not start with that name. Formatted
      * with the content-type MIME {0}, content-type encoding {1}, BOM encoding {2},
      * guessed XML encoding {3} and XML prolog encoding {4}.
      */
     private static final String HTTP_EX_2 =
 792  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
 793  
 
 794  
     /**
      * Message pattern reporting an invalid MIME type; placeholders are labelled
      * in the text as CT-MIME {0}, CT-Enc {1}, BOM {2}, XML guess {3} and
      * XML prolog {4}.
      */
     private static final String HTTP_EX_3 =
 795  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
 796  
 
 797  
 }