Coverage Report - org.apache.commons.io.input.XmlStreamReader
 
Classes in this File Line Coverage Branch Coverage Complexity
XmlStreamReader
88%
170/193
92%
148/160
6
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.io.input;
 18  
 
 19  
 import java.io.BufferedInputStream;
 20  
 import java.io.BufferedReader;
 21  
 import java.io.File;
 22  
 import java.io.FileInputStream;
 23  
 import java.io.IOException;
 24  
 import java.io.InputStream;
 25  
 import java.io.InputStreamReader;
 26  
 import java.io.Reader;
 27  
 import java.io.StringReader;
 28  
 import java.net.HttpURLConnection;
 29  
 import java.net.URL;
 30  
 import java.net.URLConnection;
 31  
 import java.text.MessageFormat;
 32  
 import java.util.Locale;
 33  
 import java.util.regex.Matcher;
 34  
 import java.util.regex.Pattern;
 35  
 
 36  
 import org.apache.commons.io.ByteOrderMark;
 37  
 
 38  
 /**
 39  
  * Character stream that handles all the necessary Voodoo to figure out the
 40  
  * charset encoding of the XML document within the stream.
 41  
  * <p>
 42  
  * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 43  
  * This one IS a character stream.
 44  
  * <p>
 45  
  * All this has to be done without consuming characters from the stream, if not
 46  
  * the XML parser will not recognize the document as a valid XML. This is not
 47  
  * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
 48  
  * right now, XmlStreamReader handles it and things work in all parsers).
 49  
  * <p>
 50  
  * The XmlStreamReader class handles the charset encoding of XML documents in
 51  
  * Files, raw streams and HTTP streams by offering a wide set of constructors.
 52  
  * <p>
 53  
  * By default the charset encoding detection is lenient, the constructor with
 54  
  * the lenient flag can be used for strict detection (following HTTP MIME and XML
 55  
  * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
 56  
  * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 57  
  * Determining the character encoding of a feed</a>.
 58  
  * <p>
 59  
  * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 60  
  * Apache License 2.0.
 61  
  *
 62  
  * @version $Id: XmlStreamReader.java 1471788 2013-04-24 23:58:56Z sebb $
 63  
  * @see org.apache.commons.io.output.XmlStreamWriter
 64  
  * @since 2.0
 65  
  */
 66  
 public class XmlStreamReader extends Reader {
 67  
     private static final int BUFFER_SIZE = 4096;
 68  
 
 69  
     private static final String UTF_8 = "UTF-8";
 70  
 
 71  
     private static final String US_ASCII = "US-ASCII";
 72  
 
 73  
     private static final String UTF_16BE = "UTF-16BE";
 74  
 
 75  
     private static final String UTF_16LE = "UTF-16LE";
 76  
 
 77  
     private static final String UTF_32BE = "UTF-32BE";
 78  
 
 79  
     private static final String UTF_32LE = "UTF-32LE";
 80  
 
 81  
     private static final String UTF_16 = "UTF-16";
 82  
 
 83  
     private static final String UTF_32 = "UTF-32";
 84  
 
 85  
     private static final String EBCDIC = "CP1047";
 86  
 
 87  10
     private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
 88  
         ByteOrderMark.UTF_8,
 89  
         ByteOrderMark.UTF_16BE,
 90  
         ByteOrderMark.UTF_16LE,
 91  
         ByteOrderMark.UTF_32BE,
 92  
         ByteOrderMark.UTF_32LE
 93  
     };
 94  
 
 95  
     // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
 96  10
     private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
 97  
         new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
 98  
         new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
 99  
         new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
 100  
         new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
 101  
                 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
 102  
         new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
 103  
                 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
 104  
         new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
 105  
     };
 106  
 
 107  
     private final Reader reader;
 108  
 
 109  
     private final String encoding;
 110  
 
 111  
     private final String defaultEncoding;
 112  
 
 113  
     /**
 114  
      * Returns the default encoding to use if none is set in HTTP content-type,
 115  
      * XML prolog and the rules based on content-type are not adequate.
 116  
      * <p>
 117  
      * If it is NULL the content-type based rules are used.
 118  
      *
 119  
      * @return the default encoding to use.
 120  
      */
 121  
     public String getDefaultEncoding() {
 122  0
         return defaultEncoding;
 123  
     }
 124  
 
 125  
     /**
 126  
      * Creates a Reader for a File.
 127  
      * <p>
 128  
      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
 129  
      * if this is also missing defaults to UTF-8.
 130  
      * <p>
 131  
      * It does a lenient charset encoding detection, check the constructor with
 132  
      * the lenient parameter for details.
 133  
      *
 134  
      * @param file File to create a Reader from.
 135  
      * @throws IOException thrown if there is a problem reading the file.
 136  
      */
 137  
     public XmlStreamReader(final File file) throws IOException {
 138  0
         this(new FileInputStream(file));
 139  0
     }
 140  
 
 141  
     /**
 142  
      * Creates a Reader for a raw InputStream.
 143  
      * <p>
 144  
      * It follows the same logic used for files.
 145  
      * <p>
 146  
      * It does a lenient charset encoding detection, check the constructor with
 147  
      * the lenient parameter for details.
 148  
      *
 149  
      * @param is InputStream to create a Reader from.
 150  
      * @throws IOException thrown if there is a problem reading the stream.
 151  
      */
 152  
     public XmlStreamReader(final InputStream is) throws IOException {
 153  70
         this(is, true);
 154  70
     }
 155  
 
 156  
     /**
 157  
      * Creates a Reader for a raw InputStream.
 158  
      * <p>
 159  
      * It follows the same logic used for files.
 160  
      * <p>
 161  
      * If lenient detection is indicated and the detection above fails as per
 162  
      * specifications it then attempts the following:
 163  
      * <p>
 164  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 165  
      * tries the detection again.
 166  
      * <p>
 167  
      * Else if the XML prolog had a charset encoding that encoding is used.
 168  
      * <p>
 169  
      * Else if the content type had a charset encoding that encoding is used.
 170  
      * <p>
 171  
      * Else 'UTF-8' is used.
 172  
      * <p>
 173  
      * If lenient detection is indicated an XmlStreamReaderException is never
 174  
      * thrown.
 175  
      *
 176  
      * @param is InputStream to create a Reader from.
 177  
      * @param lenient indicates if the charset encoding detection should be
 178  
      *        relaxed.
 179  
      * @throws IOException thrown if there is a problem reading the stream.
 180  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 181  
      *         be determined according to the specs.
 182  
      */
 183  
     public XmlStreamReader(final InputStream is, final boolean lenient) throws IOException {
 184  134
         this(is, lenient, null);
 185  100
     }
 186  
 
 187  
     /**
 188  
      * Creates a Reader for a raw InputStream.
 189  
      * <p>
 190  
      * It follows the same logic used for files.
 191  
      * <p>
 192  
      * If lenient detection is indicated and the detection above fails as per
 193  
      * specifications it then attempts the following:
 194  
      * <p>
 195  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 196  
      * tries the detection again.
 197  
      * <p>
 198  
      * Else if the XML prolog had a charset encoding that encoding is used.
 199  
      * <p>
 200  
      * Else if the content type had a charset encoding that encoding is used.
 201  
      * <p>
 202  
      * Else 'UTF-8' is used.
 203  
      * <p>
 204  
      * If lenient detection is indicated an XmlStreamReaderException is never
 205  
      * thrown.
 206  
      *
 207  
      * @param is InputStream to create a Reader from.
 208  
      * @param lenient indicates if the charset encoding detection should be
 209  
      *        relaxed.
 210  
      * @param defaultEncoding The default encoding
 211  
      * @throws IOException thrown if there is a problem reading the stream.
 212  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 213  
      *         be determined according to the specs.
 214  
      */
 215  134
     public XmlStreamReader(final InputStream is, final boolean lenient, final String defaultEncoding) throws IOException {
 216  134
         this.defaultEncoding = defaultEncoding;
 217  134
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 218  134
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 219  134
         this.encoding = doRawStream(bom, pis, lenient);
 220  100
         this.reader = new InputStreamReader(pis, encoding);
 221  100
     }
 222  
 
 223  
     /**
 224  
      * Creates a Reader using the InputStream of a URL.
 225  
      * <p>
 226  
      * If the URL is not of type HTTP and there is no 'content-type' header in
 227  
      * the fetched data it uses the same logic used for Files.
 228  
      * <p>
 229  
      * If the URL is a HTTP Url or there is a 'content-type' header in the
 230  
      * fetched data it uses the same logic used for an InputStream with
 231  
      * content-type.
 232  
      * <p>
 233  
      * It does a lenient charset encoding detection, check the constructor with
 234  
      * the lenient parameter for details.
 235  
      *
 236  
      * @param url URL to create a Reader from.
 237  
      * @throws IOException thrown if there is a problem reading the stream of
 238  
      *         the URL.
 239  
      */
 240  
     public XmlStreamReader(final URL url) throws IOException {
 241  0
         this(url.openConnection(), null);
 242  0
     }
 243  
 
 244  
     /**
 245  
      * Creates a Reader using the InputStream of a URLConnection.
 246  
      * <p>
 247  
      * If the URLConnection is not of type HttpURLConnection and there is no
 248  
      * 'content-type' header in the fetched data it uses the same logic used for
 249  
      * files.
 250  
      * <p>
 251  
      * If the URLConnection is a HTTP Url or there is a 'content-type' header in
 252  
      * the fetched data it uses the same logic used for an InputStream with
 253  
      * content-type.
 254  
      * <p>
 255  
      * It does a lenient charset encoding detection, check the constructor with
 256  
      * the lenient parameter for details.
 257  
      *
 258  
      * @param conn URLConnection to create a Reader from.
 259  
      * @param defaultEncoding The default encoding
 260  
      * @throws IOException thrown if there is a problem reading the stream of
 261  
      *         the URLConnection.
 262  
      */
 263  0
     public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
 264  0
         this.defaultEncoding = defaultEncoding;
 265  0
         final boolean lenient = true;
 266  0
         final String contentType = conn.getContentType();
 267  0
         final InputStream is = conn.getInputStream();
 268  0
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 269  0
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 270  0
         if (conn instanceof HttpURLConnection || contentType != null) {
 271  0
             this.encoding = doHttpStream(bom, pis, contentType, lenient);
 272  
         } else {
 273  0
             this.encoding = doRawStream(bom, pis, lenient);
 274  
         }
 275  0
         this.reader = new InputStreamReader(pis, encoding);
 276  0
     }
 277  
 
 278  
     /**
 279  
      * Creates a Reader using an InputStream and the associated content-type
 280  
      * header.
 281  
      * <p>
 282  
      * First it checks if the stream has BOM. If there is not BOM checks the
 283  
      * content-type encoding. If there is not content-type encoding checks the
 284  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 285  
      * encoding mandated by the content-type MIME type.
 286  
      * <p>
 287  
      * It does a lenient charset encoding detection, check the constructor with
 288  
      * the lenient parameter for details.
 289  
      *
 290  
      * @param is InputStream to create the reader from.
 291  
      * @param httpContentType content-type header to use for the resolution of
 292  
      *        the charset encoding.
 293  
      * @throws IOException thrown if there is a problem reading the stream.
 294  
      */
 295  
     public XmlStreamReader(final InputStream is, final String httpContentType)
 296  
             throws IOException {
 297  2
         this(is, httpContentType, true);
 298  2
     }
 299  
 
 300  
     /**
 301  
      * Creates a Reader using an InputStream and the associated content-type
 302  
      * header. This constructor is lenient regarding the encoding detection.
 303  
      * <p>
 304  
      * First it checks if the stream has BOM. If there is not BOM checks the
 305  
      * content-type encoding. If there is not content-type encoding checks the
 306  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 307  
      * encoding mandated by the content-type MIME type.
 308  
      * <p>
 309  
      * If lenient detection is indicated and the detection above fails as per
 310  
      * specifications it then attempts the following:
 311  
      * <p>
 312  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 313  
      * tries the detection again.
 314  
      * <p>
 315  
      * Else if the XML prolog had a charset encoding that encoding is used.
 316  
      * <p>
 317  
      * Else if the content type had a charset encoding that encoding is used.
 318  
      * <p>
 319  
      * Else 'UTF-8' is used.
 320  
      * <p>
 321  
      * If lenient detection is indicated an XmlStreamReaderException is never
 322  
      * thrown.
 323  
      *
 324  
      * @param is InputStream to create the reader from.
 325  
      * @param httpContentType content-type header to use for the resolution of
 326  
      *        the charset encoding.
 327  
      * @param lenient indicates if the charset encoding detection should be
 328  
      *        relaxed.
 329  
      * @param defaultEncoding The default encoding
 330  
      * @throws IOException thrown if there is a problem reading the stream.
 331  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 332  
      *         be determined according to the specs.
 333  
      */
 334  
     public XmlStreamReader(final InputStream is, final String httpContentType,
 335  368
             final boolean lenient, final String defaultEncoding) throws IOException {
 336  368
         this.defaultEncoding = defaultEncoding;
 337  368
         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
 338  368
         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
 339  368
         this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
 340  326
         this.reader = new InputStreamReader(pis, encoding);
 341  326
     }
 342  
 
 343  
     /**
 344  
      * Creates a Reader using an InputStream and the associated content-type
 345  
      * header. This constructor is lenient regarding the encoding detection.
 346  
      * <p>
 347  
      * First it checks if the stream has BOM. If there is not BOM checks the
 348  
      * content-type encoding. If there is not content-type encoding checks the
 349  
      * XML prolog encoding. If there is not XML prolog encoding uses the default
 350  
      * encoding mandated by the content-type MIME type.
 351  
      * <p>
 352  
      * If lenient detection is indicated and the detection above fails as per
 353  
      * specifications it then attempts the following:
 354  
      * <p>
 355  
      * If the content type was 'text/html' it replaces it with 'text/xml' and
 356  
      * tries the detection again.
 357  
      * <p>
 358  
      * Else if the XML prolog had a charset encoding that encoding is used.
 359  
      * <p>
 360  
      * Else if the content type had a charset encoding that encoding is used.
 361  
      * <p>
 362  
      * Else 'UTF-8' is used.
 363  
      * <p>
 364  
      * If lenient detection is indicated an XmlStreamReaderException is never
 365  
      * thrown.
 366  
      *
 367  
      * @param is InputStream to create the reader from.
 368  
      * @param httpContentType content-type header to use for the resolution of
 369  
      *        the charset encoding.
 370  
      * @param lenient indicates if the charset encoding detection should be
 371  
      *        relaxed.
 372  
      * @throws IOException thrown if there is a problem reading the stream.
 373  
      * @throws XmlStreamReaderException thrown if the charset encoding could not
 374  
      *         be determined according to the specs.
 375  
      */
 376  
     public XmlStreamReader(final InputStream is, final String httpContentType,
 377  
             final boolean lenient) throws IOException {
 378  134
         this(is, httpContentType, lenient, null);
 379  92
     }
 380  
 
 381  
     /**
 382  
      * Returns the charset encoding of the XmlStreamReader.
 383  
      *
 384  
      * @return charset encoding.
 385  
      */
 386  
     public String getEncoding() {
 387  200
         return encoding;
 388  
     }
 389  
 
 390  
     /**
 391  
      * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
 392  
      * @param buf the buffer to read the characters into
 393  
      * @param offset The start offset
 394  
      * @param len The maximum number of characters to read
 395  
      * @return the number of characters read or -1 if the end of the stream has been reached
 396  
      * @throws IOException if an I/O error occurs
 397  
      */
 398  
     @Override
 399  
     public int read(final char[] buf, final int offset, final int len) throws IOException {
 400  16
         return reader.read(buf, offset, len);
 401  
     }
 402  
 
 403  
     /**
 404  
      * Closes the XmlStreamReader stream.
 405  
      *
 406  
      * @throws IOException thrown if there was a problem closing the stream.
 407  
      */
 408  
     @Override
 409  
     public void close() throws IOException {
 410  326
         reader.close();
 411  326
     }
 412  
 
 413  
     /**
 414  
      * Process the raw stream.
 415  
      *
 416  
      * @param bom BOMInputStream to detect byte order marks
 417  
      * @param pis BOMInputStream to guess XML encoding
 418  
      * @param lenient indicates if the charset encoding detection should be
 419  
      *        relaxed.
 420  
      * @return the encoding to be used
 421  
      * @throws IOException thrown if there is a problem reading the stream.
 422  
      */
 423  
     private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
 424  
             throws IOException {
 425  134
         final String bomEnc      = bom.getBOMCharsetName();
 426  134
         final String xmlGuessEnc = pis.getBOMCharsetName();
 427  134
         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
 428  
         try {
 429  134
             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
 430  34
         } catch (final XmlStreamReaderException ex) {
 431  34
             if (lenient) {
 432  0
                 return doLenientDetection(null, ex);
 433  
             } else {
 434  34
                 throw ex;
 435  
             }
 436  
         }
 437  
     }
 438  
 
 439  
     /**
 440  
      * Process a HTTP stream.
 441  
      *
 442  
      * @param bom BOMInputStream to detect byte order marks
 443  
      * @param pis BOMInputStream to guess XML encoding
 444  
      * @param httpContentType The HTTP content type
 445  
      * @param lenient indicates if the charset encoding detection should be
 446  
      *        relaxed.
 447  
      * @return the encoding to be used
 448  
      * @throws IOException thrown if there is a problem reading the stream.
 449  
      */
 450  
     private String doHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
 451  
             final boolean lenient) throws IOException {
 452  368
         final String bomEnc      = bom.getBOMCharsetName();
 453  368
         final String xmlGuessEnc = pis.getBOMCharsetName();
 454  368
         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
 455  
         try {
 456  368
             return calculateHttpEncoding(httpContentType, bomEnc,
 457  
                     xmlGuessEnc, xmlEnc, lenient);
 458  276
         } catch (final XmlStreamReaderException ex) {
 459  276
             if (lenient) {
 460  234
                 return doLenientDetection(httpContentType, ex);
 461  
             } else {
 462  42
                 throw ex;
 463  
             }
 464  
         }
 465  
     }
 466  
 
 467  
     /**
 468  
      * Do lenient detection.
 469  
      *
 470  
      * @param httpContentType content-type header to use for the resolution of
 471  
      *        the charset encoding.
 472  
      * @param ex The thrown exception
 473  
      * @return the encoding
 474  
      * @throws IOException thrown if there is a problem reading the stream.
 475  
      */
 476  
     private String doLenientDetection(String httpContentType,
 477  
             XmlStreamReaderException ex) throws IOException {
 478  234
         if (httpContentType != null && httpContentType.startsWith("text/html")) {
 479  2
             httpContentType = httpContentType.substring("text/html".length());
 480  2
             httpContentType = "text/xml" + httpContentType;
 481  
             try {
 482  2
                 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
 483  
                         ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
 484  0
             } catch (final XmlStreamReaderException ex2) {
 485  0
                 ex = ex2;
 486  
             }
 487  
         }
 488  232
         String encoding = ex.getXmlEncoding();
 489  232
         if (encoding == null) {
 490  232
             encoding = ex.getContentTypeEncoding();
 491  
         }
 492  232
         if (encoding == null) {
 493  224
             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
 494  
         }
 495  232
         return encoding;
 496  
     }
 497  
 
 498  
     /**
 499  
      * Calculate the raw encoding.
 500  
      *
 501  
      * @param bomEnc BOM encoding
 502  
      * @param xmlGuessEnc XML Guess encoding
 503  
      * @param xmlEnc XML encoding
 504  
      * @return the raw encoding
 505  
      * @throws IOException thrown if there is a problem reading the stream.
 506  
      */
 507  
     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
 508  
             final String xmlEnc) throws IOException {
 509  
 
 510  
         // BOM is Null
 511  312
         if (bomEnc == null) {
 512  112
             if (xmlGuessEnc == null || xmlEnc == null) {
 513  52
                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
 514  
             }
 515  60
             if (xmlEnc.equals(UTF_16) &&
 516  
                (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
 517  4
                 return xmlGuessEnc;
 518  
             }
 519  56
             return xmlEnc;
 520  
         }
 521  
 
 522  
         // BOM is UTF-8
 523  200
         if (bomEnc.equals(UTF_8)) {
 524  56
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
 525  8
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 526  8
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 527  
             }
 528  48
             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
 529  20
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 530  20
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 531  
             }
 532  28
             return bomEnc;
 533  
         }
 534  
 
 535  
         // BOM is UTF-16BE or UTF-16LE
 536  144
         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
 537  70
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
 538  12
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 539  12
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 540  
             }
 541  58
             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
 542  28
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 543  28
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 544  
             }
 545  30
             return bomEnc;
 546  
         }
 547  
 
 548  
         // BOM is UTF-32BE or UTF-32LE
 549  74
         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
 550  72
             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
 551  12
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 552  12
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 553  
             }
 554  60
             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
 555  28
                 final String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 556  28
                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 557  
             }
 558  32
             return bomEnc;
 559  
         }
 560  
 
 561  
         // BOM is something else
 562  2
         final String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
 563  2
         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
 564  
     }
 565  
 
 566  
 
 567  
     /**
 568  
      * Calculate the HTTP encoding.
 569  
      *
 570  
      * @param httpContentType The HTTP content type
 571  
      * @param bomEnc BOM encoding
 572  
      * @param xmlGuessEnc XML Guess encoding
 573  
      * @param xmlEnc XML encoding
 574  
      * @param lenient indicates if the charset encoding detection should be
 575  
      *        relaxed.
 576  
      * @return the HTTP encoding
 577  
      * @throws IOException thrown if there is a problem reading the stream.
 578  
      */
 579  
     String calculateHttpEncoding(final String httpContentType,
 580  
             final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
 581  
             final boolean lenient) throws IOException {
 582  
 
 583  
         // Lenient and has XML encoding
 584  434
         if (lenient && xmlEnc != null) {
 585  38
             return xmlEnc;
 586  
         }
 587  
 
 588  
         // Determine mime/encoding content types from HTTP Content Type
 589  396
         final String cTMime = getContentTypeMime(httpContentType);
 590  396
         final String cTEnc  = getContentTypeEncoding(httpContentType);
 591  396
         final boolean appXml  = isAppXml(cTMime);
 592  396
         final boolean textXml = isTextXml(cTMime);
 593  
 
 594  
         // Mime type NOT "application/xml" or "text/xml"
 595  396
         if (!appXml && !textXml) {
 596  232
             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 597  232
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 598  
         }
 599  
 
 600  
         // No content type encoding
 601  164
         if (cTEnc == null) {
 602  42
             if (appXml) {
 603  20
                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
 604  
             } else {
 605  22
                 return defaultEncoding == null ? US_ASCII : defaultEncoding;
 606  
             }
 607  
         }
 608  
 
 609  
         // UTF-16BE or UTF-16LE content type encoding
 610  122
         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
 611  22
             if (bomEnc != null) {
 612  18
                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 613  18
                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 614  
             }
 615  4
             return cTEnc;
 616  
         }
 617  
 
 618  
         // UTF-16 content type encoding
 619  100
         if (cTEnc.equals(UTF_16)) {
 620  32
             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
 621  18
                 return bomEnc;
 622  
             }
 623  14
             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 624  14
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 625  
         }
 626  
 
 627  
         // UTF-32BE or UTF-32LE content type encoding
 628  68
         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
 629  22
             if (bomEnc != null) {
 630  18
                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 631  18
                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 632  
             }
 633  4
             return cTEnc;
 634  
         }
 635  
 
 636  
         // UTF-32 content type encoding
 637  46
         if (cTEnc.equals(UTF_32)) {
 638  26
             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
 639  12
                 return bomEnc;
 640  
             }
 641  14
             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 642  14
             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
 643  
         }
 644  
 
 645  20
         return cTEnc;
 646  
     }
 647  
 
 648  
     /**
 649  
      * Returns MIME type or NULL if httpContentType is NULL.
 650  
      *
 651  
      * @param httpContentType the HTTP content type
 652  
      * @return The mime content type
 653  
      */
 654  
     static String getContentTypeMime(final String httpContentType) {
 655  464
         String mime = null;
 656  464
         if (httpContentType != null) {
 657  226
             final int i = httpContentType.indexOf(";");
 658  226
             if (i >= 0) {
 659  166
                 mime = httpContentType.substring(0, i);
 660  
             } else {
 661  60
                 mime = httpContentType;
 662  
             }
 663  226
             mime = mime.trim();
 664  
         }
 665  464
         return mime;
 666  
     }
 667  
 
 668  10
     private static final Pattern CHARSET_PATTERN = Pattern
 669  
             .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
 670  
 
 671  
     /**
 672  
      * Returns charset parameter value, NULL if not present, NULL if
 673  
      * httpContentType is NULL.
 674  
      *
 675  
      * @param httpContentType the HTTP content type
 676  
      * @return The content type encoding (upcased)
 677  
      */
 678  
     static String getContentTypeEncoding(final String httpContentType) {
 679  500
         String encoding = null;
 680  500
         if (httpContentType != null) {
 681  262
             final int i = httpContentType.indexOf(";");
 682  262
             if (i > -1) {
 683  202
                 final String postMime = httpContentType.substring(i + 1);
 684  202
                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
 685  202
                 encoding = m.find() ? m.group(1) : null;
 686  202
                 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
 687  
             }
 688  
         }
 689  500
         return encoding;
 690  
     }
 691  
 
 692  10
     public static final Pattern ENCODING_PATTERN = Pattern.compile(
 693  
             "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
 694  
             Pattern.MULTILINE);
 695  
 
 696  
     /**
 697  
      * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
 698  
      *
 699  
      * @param is InputStream to create the reader from.
 700  
      * @param guessedEnc guessed encoding
 701  
      * @return the encoding declared in the <?xml encoding=...?>
 702  
      * @throws IOException thrown if there is a problem reading the stream.
 703  
      */
 704  
     private static String getXmlProlog(final InputStream is, final String guessedEnc)
 705  
             throws IOException {
 706  502
         String encoding = null;
 707  502
         if (guessedEnc != null) {
 708  232
             final byte[] bytes = new byte[BUFFER_SIZE];
 709  232
             is.mark(BUFFER_SIZE);
 710  232
             int offset = 0;
 711  232
             int max = BUFFER_SIZE;
 712  232
             int c = is.read(bytes, offset, max);
 713  232
             int firstGT = -1;
 714  232
             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
 715  464
             while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
 716  232
                 offset += c;
 717  232
                 max -= c;
 718  232
                 c = is.read(bytes, offset, max);
 719  232
                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
 720  232
                 firstGT = xmlProlog.indexOf('>');
 721  
             }
 722  232
             if (firstGT == -1) {
 723  0
                 if (c == -1) {
 724  0
                     throw new IOException("Unexpected end of XML stream");
 725  
                 } else {
 726  0
                     throw new IOException(
 727  
                             "XML prolog or ROOT element not found on first "
 728  
                                     + offset + " bytes");
 729  
                 }
 730  
             }
 731  232
             final int bytesRead = offset;
 732  232
             if (bytesRead > 0) {
 733  232
                 is.reset();
 734  232
                 final BufferedReader bReader = new BufferedReader(new StringReader(
 735  
                         xmlProlog.substring(0, firstGT + 1)));
 736  232
                 final StringBuffer prolog = new StringBuffer();
 737  232
                 String line = bReader.readLine();
 738  496
                 while (line != null) {
 739  264
                     prolog.append(line);
 740  264
                     line = bReader.readLine();
 741  
                 }
 742  232
                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
 743  232
                 if (m.find()) {
 744  182
                     encoding = m.group(1).toUpperCase();
 745  182
                     encoding = encoding.substring(1, encoding.length() - 1);
 746  
                 }
 747  
             }
 748  
         }
 749  502
         return encoding;
 750  
     }
 751  
 
 752  
     /**
 753  
      * Indicates if the MIME type belongs to the APPLICATION XML family.
 754  
      *
 755  
      * @param mime The mime type
 756  
      * @return true if the mime type belongs to the APPLICATION XML family,
 757  
      * otherwise false
 758  
      */
 759  
     static boolean isAppXml(final String mime) {
 760  444
         return mime != null &&
 761  
                (mime.equals("application/xml") ||
 762  
                 mime.equals("application/xml-dtd") ||
 763  
                 mime.equals("application/xml-external-parsed-entity") ||
 764  
                mime.startsWith("application/") && mime.endsWith("+xml"));
 765  
     }
 766  
 
 767  
     /**
 768  
      * Indicates if the MIME type belongs to the TEXT XML family.
 769  
      *
 770  
      * @param mime The mime type
 771  
      * @return true if the mime type belongs to the TEXT XML family,
 772  
      * otherwise false
 773  
      */
 774  
     static boolean isTextXml(final String mime) {
 775  432
         return mime != null &&
 776  
               (mime.equals("text/xml") ||
 777  
                mime.equals("text/xml-external-parsed-entity") ||
 778  
               mime.startsWith("text/") && mime.endsWith("+xml"));
 779  
     }
 780  
 
 781  
     private static final String RAW_EX_1 =
 782  
         "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
 783  
 
 784  
     private static final String RAW_EX_2 =
 785  
         "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
 786  
 
 787  
     private static final String HTTP_EX_1 =
 788  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
 789  
 
 790  
     private static final String HTTP_EX_2 =
 791  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
 792  
 
 793  
     private static final String HTTP_EX_3 =
 794  
         "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
 795  
 
 796  
 }