View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input.compatibility;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.HttpURLConnection;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.nio.charset.StandardCharsets;
31  import java.nio.file.Files;
32  import java.text.MessageFormat;
33  import java.util.Locale;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  
37  import org.apache.commons.io.IOUtils;
38  import org.apache.commons.io.output.XmlStreamWriter;
39  
40  /**
41   * Character stream that handles all the necessary Voodoo to figure out the
42   * charset encoding of the XML document within the stream.
43   * <p>
44   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
45   * This one IS a character stream.
46   * </p>
47   * <p>
 * All this has to be done without consuming characters from the stream;
 * otherwise the XML parser will not recognize the document as valid XML. This
 * is not 100% true, but it's close enough (the UTF-8 BOM is not handled by all
 * parsers right now; XmlStreamReader handles it and things work in all parsers).
52   * </p>
53   * <p>
54   * The XmlStreamReader class handles the charset encoding of XML documents in
55   * Files, raw streams and HTTP streams by offering a wide set of constructors.
56   * </p>
57   * <p>
 * By default the charset encoding detection is lenient; the constructors that
 * take a lenient flag can be used for strict detection (following the HTTP MIME
 * and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
 * href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
63   * </p>
64   * <p>
65   * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under
66   * Apache License 2.0.
67   * </p>
68   *
69   * @see XmlStreamWriter
70   */
71  public class XmlStreamReader extends Reader {
72  
73      private static final String UTF_8 = StandardCharsets.UTF_8.name();
74  
75      private static final String US_ASCII = StandardCharsets.US_ASCII.name();
76  
77      private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
78  
79      private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
80  
81      private static final String UTF_16 = StandardCharsets.UTF_16.name();
82  
83      private static final String UTF_32BE = "UTF-32BE";
84  
85      private static final String UTF_32LE = "UTF-32LE";
86  
87      private static final String UTF_32 = "UTF-32";
88  
89      private static final String EBCDIC = "CP1047";
90  
91      private static String staticDefaultEncoding;
92  
93      private static final Pattern CHARSET_PATTERN = Pattern
94              .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
95  
96      public static final Pattern ENCODING_PATTERN = Pattern.compile(
97              "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
98              Pattern.MULTILINE);
99  
100     private static final MessageFormat RAW_EX_1 = new MessageFormat(
101             "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
102 
103     private static final MessageFormat RAW_EX_2 = new MessageFormat(
104             "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
105 
106     private static final MessageFormat HTTP_EX_1 = new MessageFormat(
107             "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null");
108 
109     private static final MessageFormat HTTP_EX_2 = new MessageFormat(
110             "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
111 
112     private static final MessageFormat HTTP_EX_3 = new MessageFormat(
113             "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME");
114 
115     // returns the BOM in the stream, null if not present,
116     // if there was BOM the in the stream it is consumed
117     static String getBOMEncoding(final BufferedInputStream is)
118             throws IOException {
119         String encoding = null;
120         final int[] bytes = new int[3];
121         is.mark(3);
122         bytes[0] = is.read();
123         bytes[1] = is.read();
124         bytes[2] = is.read();
125 
126         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
127             encoding = UTF_16BE;
128             is.reset();
129             is.read();
130             is.read();
131         } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
132             encoding = UTF_16LE;
133             is.reset();
134             is.read();
135             is.read();
136         } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
137             encoding = UTF_8;
138         } else {
139             is.reset();
140         }
141         return encoding;
142     }
143 
144     // returns charset parameter value, null if not present, null if
145     // httpContentType is null
146     static String getContentTypeEncoding(final String httpContentType) {
147         String encoding = null;
148         if (httpContentType != null) {
149             final int i = httpContentType.indexOf(";");
150             if (i > -1) {
151                 final String postMime = httpContentType.substring(i + 1);
152                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
153                 encoding = m.find() ? m.group(1) : null;
154                 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
155             }
156         }
157         return encoding;
158     }
159 
160     // returns MIME type or null if httpContentType is null
161     static String getContentTypeMime(final String httpContentType) {
162         String mime = null;
163         if (httpContentType != null) {
164             final int i = httpContentType.indexOf(";");
165             mime = (i == -1 ? httpContentType : httpContentType.substring(0,
166                     i)).trim();
167         }
168         return mime;
169     }
170 
171     /**
172      * Returns the default encoding to use if none is set in HTTP content-type,
173      * XML prolog and the rules based on content-type are not adequate.
174      * <p>
175      * If it is null the content-type based rules are used.
176      *
177      * @return the default encoding to use.
178      */
179     public static String getDefaultEncoding() {
180         return staticDefaultEncoding;
181     }
182 
183     // returns the best guess for the encoding by looking the first bytes of the
184     // stream, '<?'
185     private static String getXMLGuessEncoding(final BufferedInputStream is)
186             throws IOException {
187         String encoding = null;
188         final int[] bytes = new int[4];
189         is.mark(4);
190         bytes[0] = is.read();
191         bytes[1] = is.read();
192         bytes[2] = is.read();
193         bytes[3] = is.read();
194         is.reset();
195 
196         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
197                 && bytes[3] == 0x3F) {
198             encoding = UTF_16BE;
199         } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F
200                 && bytes[3] == 0x00) {
201             encoding = UTF_16LE;
202         } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78
203                 && bytes[3] == 0x6D) {
204             encoding = UTF_8;
205         } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7
206                 && bytes[3] == 0x94) {
207             encoding = EBCDIC;
208         }
209         return encoding;
210     }
211 
212     // returns the encoding declared in the <?xml encoding=...?>, null if none
213     private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc)
214             throws IOException {
215         String encoding = null;
216         if (guessedEnc != null) {
217             final byte[] bytes = IOUtils.byteArray();
218             is.mark(IOUtils.DEFAULT_BUFFER_SIZE);
219             int offset = 0;
220             int max = IOUtils.DEFAULT_BUFFER_SIZE;
221             int c = is.read(bytes, offset, max);
222             int firstGT = -1;
223             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
224             while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
225                 offset += c;
226                 max -= c;
227                 c = is.read(bytes, offset, max);
228                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
229                 firstGT = xmlProlog.indexOf('>');
230             }
231             if (firstGT == -1) {
232                 if (c == -1) {
233                     throw new IOException("Unexpected end of XML stream");
234                 }
235                 throw new IOException(
236                         "XML prolog or ROOT element not found on first "
237                                 + offset + " bytes");
238             }
239             final int bytesRead = offset;
240             if (bytesRead > 0) {
241                 is.reset();
242                 final BufferedReader bReader = new BufferedReader(new StringReader(
243                         xmlProlog.substring(0, firstGT + 1)));
244                 final StringBuilder prolog = new StringBuilder();
245                 String line;
246                 while ((line = bReader.readLine()) != null) {
247                     prolog.append(line);
248                 }
249                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
250                 if (m.find()) {
251                     encoding = m.group(1).toUpperCase(Locale.ROOT);
252                     encoding = encoding.substring(1, encoding.length() - 1);
253                 }
254             }
255         }
256         return encoding;
257     }
258 
259     // indicates if the MIME type belongs to the APPLICATION XML family
260     static boolean isAppXml(final String mime) {
261         return mime != null
262                 && (mime.equals("application/xml")
263                         || mime.equals("application/xml-dtd")
264                         || mime
265                                 .equals("application/xml-external-parsed-entity") || mime
266                         .startsWith("application/") && mime.endsWith("+xml"));
267     }
268 
269     // indicates if the MIME type belongs to the TEXT XML family
270     static boolean isTextXml(final String mime) {
271         return mime != null
272                 && (mime.equals("text/xml")
273                         || mime.equals("text/xml-external-parsed-entity") || mime
274                         .startsWith("text/") && mime.endsWith("+xml"));
275     }
276 
277     /**
278      * Sets the default encoding to use if none is set in HTTP content-type, XML
279      * prolog and the rules based on content-type are not adequate.
280      * <p>
281      * If it is set to null the content-type based rules are used.
282      * <p>
283      * By default it is null.
284      *
285      * @param encoding charset encoding to default to.
286      */
287     public static void setDefaultEncoding(final String encoding) {
288         staticDefaultEncoding = encoding;
289     }
290 
291     private Reader reader;
292 
293     private String encoding;
294 
295     private final String defaultEncoding;
296 
297     /**
298      * Creates a Reader for a File.
299      * <p>
300      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
301      * if this is also missing defaults to UTF-8.
302      * <p>
303      * It does a lenient charset encoding detection, check the constructor with
304      * the lenient parameter for details.
305      *
306      * @param file File to create a Reader from.
307      * @throws IOException thrown if there is a problem reading the file.
308      */
309     @SuppressWarnings("resource") // FileInputStream is closed when this closed when this object is closed.
310     public XmlStreamReader(final File file) throws IOException {
311         this(Files.newInputStream(file.toPath()));
312     }
313 
314     /**
315      * Creates a Reader for a raw InputStream.
316      * <p>
317      * It follows the same logic used for files.
318      * <p>
319      * It does a lenient charset encoding detection, check the constructor with
320      * the lenient parameter for details.
321      *
322      * @param inputStream InputStream to create a Reader from.
323      * @throws IOException thrown if there is a problem reading the stream.
324      */
325     public XmlStreamReader(final InputStream inputStream) throws IOException {
326         this(inputStream, true);
327     }
328 
329     /**
330      * Creates a Reader for a raw InputStream.
331      * <p>
332      * It follows the same logic used for files.
333      * <p>
334      * If lenient detection is indicated and the detection above fails as per
335      * specifications it then attempts the following:
336      * <p>
337      * If the content type was 'text/html' it replaces it with 'text/xml' and
338      * tries the detection again.
339      * <p>
340      * Else if the XML prolog had a charset encoding that encoding is used.
341      * <p>
342      * Else if the content type had a charset encoding that encoding is used.
343      * <p>
344      * Else 'UTF-8' is used.
345      * <p>
346      * If lenient detection is indicated an XmlStreamReaderException is never
347      * thrown.
348      *
349      * @param inputStream InputStream to create a Reader from.
350      * @param lenient indicates if the charset encoding detection should be
351      *        relaxed.
352      * @throws IOException thrown if there is a problem reading the stream.
353      * @throws XmlStreamReaderException thrown if the charset encoding could not
354      *         be determined according to the specification.
355      */
356     public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException,
357             XmlStreamReaderException {
358         defaultEncoding = staticDefaultEncoding;
359         try {
360             doRawStream(inputStream);
361         } catch (final XmlStreamReaderException ex) {
362             if (!lenient) {
363                 throw ex;
364             }
365             doLenientDetection(null, ex);
366         }
367     }
368 
369     /**
370      * Creates a Reader using an InputStream and the associated content-type
371      * header.
372      * <p>
373      * First it checks if the stream has BOM. If there is not BOM checks the
374      * content-type encoding. If there is not content-type encoding checks the
375      * XML prolog encoding. If there is not XML prolog encoding uses the default
376      * encoding mandated by the content-type MIME type.
377      * <p>
378      * It does a lenient charset encoding detection, check the constructor with
379      * the lenient parameter for details.
380      *
381      * @param inputStream InputStream to create the reader from.
382      * @param httpContentType content-type header to use for the resolution of
383      *        the charset encoding.
384      * @throws IOException thrown if there is a problem reading the file.
385      */
386     public XmlStreamReader(final InputStream inputStream, final String httpContentType)
387             throws IOException {
388         this(inputStream, httpContentType, true);
389     }
390 
391     /**
392      * Creates a Reader using an InputStream and the associated content-type
393      * header. This constructor is lenient regarding the encoding detection.
394      * <p>
395      * First it checks if the stream has BOM. If there is not BOM checks the
396      * content-type encoding. If there is not content-type encoding checks the
397      * XML prolog encoding. If there is not XML prolog encoding uses the default
398      * encoding mandated by the content-type MIME type.
399      * <p>
400      * If lenient detection is indicated and the detection above fails as per
401      * specifications it then attempts the following:
402      * <p>
403      * If the content type was 'text/html' it replaces it with 'text/xml' and
404      * tries the detection again.
405      * <p>
406      * Else if the XML prolog had a charset encoding that encoding is used.
407      * <p>
408      * Else if the content type had a charset encoding that encoding is used.
409      * <p>
410      * Else 'UTF-8' is used.
411      * <p>
412      * If lenient detection is indicated an XmlStreamReaderException is never
413      * thrown.
414      *
415      * @param inputStream InputStream to create the reader from.
416      * @param httpContentType content-type header to use for the resolution of
417      *        the charset encoding.
418      * @param lenient indicates if the charset encoding detection should be
419      *        relaxed.
420      * @throws IOException thrown if there is a problem reading the file.
421      * @throws XmlStreamReaderException thrown if the charset encoding could not
422      *         be determined according to the specification.
423      */
424     public XmlStreamReader(final InputStream inputStream, final String httpContentType,
425             final boolean lenient) throws IOException, XmlStreamReaderException {
426         this(inputStream, httpContentType, lenient, null);
427     }
428 
429     /**
430      * Creates a Reader using an InputStream and the associated content-type
431      * header. This constructor is lenient regarding the encoding detection.
432      * <p>
433      * First it checks if the stream has BOM. If there is not BOM checks the
434      * content-type encoding. If there is not content-type encoding checks the
435      * XML prolog encoding. If there is not XML prolog encoding uses the default
436      * encoding mandated by the content-type MIME type.
437      * <p>
438      * If lenient detection is indicated and the detection above fails as per
439      * specifications it then attempts the following:
440      * <p>
441      * If the content type was 'text/html' it replaces it with 'text/xml' and
442      * tries the detection again.
443      * <p>
444      * Else if the XML prolog had a charset encoding that encoding is used.
445      * <p>
446      * Else if the content type had a charset encoding that encoding is used.
447      * <p>
448      * Else 'UTF-8' is used.
449      * <p>
450      * If lenient detection is indicated an XmlStreamReaderException is never
451      * thrown.
452      *
453      * @param inputStream InputStream to create the reader from.
454      * @param httpContentType content-type header to use for the resolution of
455      *        the charset encoding.
456      * @param lenient indicates if the charset encoding detection should be
457      *        relaxed.
458      * @param defaultEncoding the default encoding to use
459      * @throws IOException thrown if there is a problem reading the file.
460      * @throws XmlStreamReaderException thrown if the charset encoding could not
461      *         be determined according to the specification.
462      */
463     public XmlStreamReader(final InputStream inputStream, final String httpContentType,
464             final boolean lenient, final String defaultEncoding) throws IOException,
465             XmlStreamReaderException {
466         this.defaultEncoding = defaultEncoding == null ? staticDefaultEncoding
467                 : defaultEncoding;
468         try {
469             doHttpStream(inputStream, httpContentType, lenient);
470         } catch (final XmlStreamReaderException ex) {
471             if (!lenient) {
472                 throw ex;
473             }
474             doLenientDetection(httpContentType, ex);
475         }
476     }
477 
478     /**
479      * Creates a Reader using the InputStream of a URL.
480      * <p>
481      * If the URL is not of type HTTP and there is not 'content-type' header in
482      * the fetched data it uses the same logic used for Files.
483      * <p>
484      * If the URL is a HTTP Url or there is a 'content-type' header in the
485      * fetched data it uses the same logic used for an InputStream with
486      * content-type.
487      * <p>
488      * It does a lenient charset encoding detection, check the constructor with
489      * the lenient parameter for details.
490      *
491      * @param url URL to create a Reader from.
492      * @throws IOException thrown if there is a problem reading the stream of
493      *         the URL.
494      */
495     public XmlStreamReader(final URL url) throws IOException {
496         // TODO URLConnection leak.
497         this(url.openConnection());
498     }
499 
500     /**
501      * Creates a Reader using the InputStream of a URLConnection.
502      * <p>
503      * If the URLConnection is not of type HttpURLConnection and there is not
504      * 'content-type' header in the fetched data it uses the same logic used for
505      * files.
506      * <p>
507      * If the URLConnection is a HTTP Url or there is a 'content-type' header in
508      * the fetched data it uses the same logic used for an InputStream with
509      * content-type.
510      * <p>
511      * It does a lenient charset encoding detection, check the constructor with
512      * the lenient parameter for details.
513      *
514      * @param conn URLConnection to create a Reader from.
515      * @throws IOException thrown if there is a problem reading the stream of
516      *         the URLConnection.
517      */
518     public XmlStreamReader(final URLConnection conn) throws IOException {
519         defaultEncoding = staticDefaultEncoding;
520         final boolean lenient = true;
521         if (conn instanceof HttpURLConnection || conn.getContentType() != null) {
522             try {
523                 doHttpStream(conn.getInputStream(), conn.getContentType(),
524                         lenient);
525             } catch (final XmlStreamReaderException ex) {
526                 doLenientDetection(conn.getContentType(), ex);
527             }
528         } else {
529             try {
530                 doRawStream(conn.getInputStream());
531             } catch (final XmlStreamReaderException ex) {
532                 doLenientDetection(null, ex);
533             }
534         }
535     }
536 
537     // InputStream is passed for XmlStreamReaderException creation only
538     String calculateHttpEncoding(final String cTMime, final String cTEnc,
539             final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final InputStream is,
540             final boolean lenient) throws IOException {
541         final String encoding;
542         if (lenient && xmlEnc != null) {
543             encoding = xmlEnc;
544         } else {
545             final boolean appXml = isAppXml(cTMime);
546             final boolean textXml = isTextXml(cTMime);
547             if (!appXml && !textXml) {
548                 throw new XmlStreamReaderException(HTTP_EX_3
549                         .format(new Object[] { cTMime, cTEnc, bomEnc,
550                                 xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
551                         xmlGuessEnc, xmlEnc, is);
552             }
553             if (cTEnc == null) {
554                 if (appXml) {
555                     encoding = calculateRawEncoding(bomEnc, xmlGuessEnc,
556                             xmlEnc, is);
557                 } else {
558                     encoding = defaultEncoding == null ? US_ASCII
559                             : defaultEncoding;
560                 }
561             } else if (bomEnc != null
562                     && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
563                 throw new XmlStreamReaderException(HTTP_EX_1
564                         .format(new Object[] { cTMime, cTEnc, bomEnc,
565                                 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
566                         bomEnc, xmlGuessEnc, xmlEnc, is);
567             } else if (cTEnc.equals(UTF_16)) {
568                 if (bomEnc == null || !bomEnc.startsWith(UTF_16)) {
569                     throw new XmlStreamReaderException(HTTP_EX_2
570                             .format(new Object[] { cTMime, cTEnc, bomEnc,
571                                     xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
572                             bomEnc, xmlGuessEnc, xmlEnc, is);
573                 }
574                 encoding = bomEnc;
575             } else if (bomEnc != null
576                     && (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE))) {
577                 throw new XmlStreamReaderException(HTTP_EX_1
578                         .format(new Object[] { cTMime, cTEnc, bomEnc,
579                                 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
580                         bomEnc, xmlGuessEnc, xmlEnc, is);
581             } else if (cTEnc.equals(UTF_32)) {
582                 if (bomEnc == null || !bomEnc.startsWith(UTF_32)) {
583                     throw new XmlStreamReaderException(HTTP_EX_2
584                             .format(new Object[] { cTMime, cTEnc, bomEnc,
585                                     xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
586                             bomEnc, xmlGuessEnc, xmlEnc, is);
587                 }
588                 encoding = bomEnc;
589             } else {
590                 encoding = cTEnc;
591             }
592         }
593         return encoding;
594     }
595 
596     // InputStream is passed for XmlStreamReaderException creation only
597     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
598             final String xmlEnc, final InputStream is) throws IOException {
599         final String encoding;
600         if (bomEnc == null) {
601             if (xmlGuessEnc == null || xmlEnc == null) {
602                 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
603             } else if (xmlEnc.equals(UTF_16)
604                     && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc
605                             .equals(UTF_16LE))) {
606                 encoding = xmlGuessEnc;
607             } else if (xmlEnc.equals(UTF_32)
608                     && (xmlGuessEnc.equals(UTF_32BE) || xmlGuessEnc
609                             .equals(UTF_32LE))) {
610                 encoding = xmlGuessEnc;
611             } else {
612                 encoding = xmlEnc;
613             }
614         } else if (bomEnc.equals(UTF_8)) {
615             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
616                 throw new XmlStreamReaderException(RAW_EX_1
617                         .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
618                         bomEnc, xmlGuessEnc, xmlEnc, is);
619             }
620             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
621                 throw new XmlStreamReaderException(RAW_EX_1
622                         .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
623                         bomEnc, xmlGuessEnc, xmlEnc, is);
624             }
625             encoding = UTF_8;
626         } else {
627             if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
628                 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
629                     throw new XmlStreamReaderException(RAW_EX_1.format(new Object[] { bomEnc,
630                             xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc, is);
631                 }
632                 if (xmlEnc != null && !xmlEnc.equals(UTF_16)
633                         && !xmlEnc.equals(bomEnc)) {
634                     throw new XmlStreamReaderException(RAW_EX_1
635                             .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
636                             bomEnc, xmlGuessEnc, xmlEnc, is);
637                 }
638             } else if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
639                 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
640                     throw new XmlStreamReaderException(RAW_EX_1.format(new Object[] { bomEnc,
641                             xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc, is);
642                 }
643                 if (xmlEnc != null && !xmlEnc.equals(UTF_32)
644                         && !xmlEnc.equals(bomEnc)) {
645                     throw new XmlStreamReaderException(RAW_EX_1
646                             .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
647                             bomEnc, xmlGuessEnc, xmlEnc, is);
648                 }
649             } else {
650                 throw new XmlStreamReaderException(RAW_EX_2.format(new Object[] {
651                         bomEnc, xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc,
652                         xmlEnc, is);
653             }
654             encoding = bomEnc;
655         }
656         return encoding;
657     }
658 
659     /**
660      * Closes the XmlStreamReader stream.
661      *
662      * @throws IOException thrown if there was a problem closing the stream.
663      */
664     @Override
665     public void close() throws IOException {
666         reader.close();
667     }
668 
669     private void doHttpStream(final InputStream inputStream, final String httpContentType,
670             final boolean lenient) throws IOException {
671         final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
672         final String cTMime = getContentTypeMime(httpContentType);
673         final String cTEnc = getContentTypeEncoding(httpContentType);
674         final String bomEnc = getBOMEncoding(pis);
675         final String xmlGuessEnc = getXMLGuessEncoding(pis);
676         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
677         final String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc,
678                 xmlGuessEnc, xmlEnc, pis, lenient);
679         prepareReader(pis, encoding);
680     }
681 
682     private void doLenientDetection(String httpContentType,
683             XmlStreamReaderException ex) throws IOException {
684         if (httpContentType != null && httpContentType.startsWith("text/html")) {
685             httpContentType = httpContentType.substring("text/html"
686                     .length());
687             httpContentType = "text/xml" + httpContentType;
688             try {
689                 doHttpStream(ex.getInputStream(), httpContentType, true);
690                 ex = null;
691             } catch (final XmlStreamReaderException ex2) {
692                 ex = ex2;
693             }
694         }
695         if (ex != null) {
696             String encoding = ex.getXmlEncoding();
697             if (encoding == null) {
698                 encoding = ex.getContentTypeEncoding();
699             }
700             if (encoding == null) {
701                 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
702             }
703             prepareReader(ex.getInputStream(), encoding);
704         }
705     }
706 
707     private void doRawStream(final InputStream inputStream)
708             throws IOException {
709         final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
710         final String bomEnc = getBOMEncoding(pis);
711         final String xmlGuessEnc = getXMLGuessEncoding(pis);
712         final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
713         final String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
714         prepareReader(pis, encoding);
715     }
716 
717     /**
718      * Returns the charset encoding of the XmlStreamReader.
719      *
720      * @return charset encoding.
721      */
722     public String getEncoding() {
723         return encoding;
724     }
725 
726     private void prepareReader(final InputStream inputStream, final String encoding)
727             throws IOException {
728         reader = new InputStreamReader(inputStream, encoding);
729         this.encoding = encoding;
730     }
731 
732     @Override
733     public int read(final char[] buf, final int offset, final int len) throws IOException {
734         return reader.read(buf, offset, len);
735     }
736 
737 }