001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStream;
025import java.io.InputStreamReader;
026import java.io.Reader;
027import java.io.StringReader;
028import java.net.HttpURLConnection;
029import java.net.URL;
030import java.net.URLConnection;
031import java.text.MessageFormat;
032import java.util.Locale;
033import java.util.regex.Matcher;
034import java.util.regex.Pattern;
035
036import org.apache.commons.io.ByteOrderMark;
037
/**
 * Character stream that handles all the necessary voodoo to figure out the
 * charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 * This one IS a character stream.
 * <p>
 * All this has to be done without consuming characters from the stream;
 * otherwise the XML parser will not recognize the document as valid XML. This
 * is not 100% true, but it's close enough (a UTF-8 BOM is not handled by all
 * parsers right now; XmlStreamReader handles it and things work in all parsers).
 * <p>
 * The XmlStreamReader class handles the charset encoding of XML documents in
 * Files, raw streams and HTTP streams by offering a wide set of constructors.
 * <p>
 * By default the charset encoding detection is lenient; the constructors with
 * the lenient flag can be used for strict detection (following the HTTP MIME
 * and XML specifications). All this is nicely explained by Mark Pilgrim in his
 * blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * <p>
 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 * Apache License 2.0.
 *
 * @version $Id: XmlStreamReader.java 1686747 2015-06-21 18:44:49Z krosenvold $
 * @see org.apache.commons.io.output.XmlStreamWriter
 * @since 2.0
 */
066public class XmlStreamReader extends Reader {
067    private static final int BUFFER_SIZE = 4096;
068
069    private static final String UTF_8 = "UTF-8";
070
071    private static final String US_ASCII = "US-ASCII";
072
073    private static final String UTF_16BE = "UTF-16BE";
074
075    private static final String UTF_16LE = "UTF-16LE";
076
077    private static final String UTF_32BE = "UTF-32BE";
078
079    private static final String UTF_32LE = "UTF-32LE";
080
081    private static final String UTF_16 = "UTF-16";
082
083    private static final String UTF_32 = "UTF-32";
084
085    private static final String EBCDIC = "CP1047";
086
087    private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
088        ByteOrderMark.UTF_8,
089        ByteOrderMark.UTF_16BE,
090        ByteOrderMark.UTF_16LE,
091        ByteOrderMark.UTF_32BE,
092        ByteOrderMark.UTF_32LE
093    };
094
095    // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
096    private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
097        new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
098        new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
099        new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
100        new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
101                0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
102        new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
103                0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
104        new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
105    };
106
107    private final Reader reader;
108
109    private final String encoding;
110
111    private final String defaultEncoding;
112
113    /**
114     * Returns the default encoding to use if none is set in HTTP content-type,
115     * XML prolog and the rules based on content-type are not adequate.
116     * <p>
117     * If it is NULL the content-type based rules are used.
118     *
119     * @return the default encoding to use.
120     */
121    public String getDefaultEncoding() {
122        return defaultEncoding;
123    }
124
125    /**
126     * Creates a Reader for a File.
127     * <p>
128     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
129     * if this is also missing defaults to UTF-8.
130     * <p>
131     * It does a lenient charset encoding detection, check the constructor with
132     * the lenient parameter for details.
133     *
134     * @param file File to create a Reader from.
135     * @throws IOException thrown if there is a problem reading the file.
136     */
137    public XmlStreamReader(final File file) throws IOException {
138        this(new FileInputStream(file));
139    }
140
141    /**
142     * Creates a Reader for a raw InputStream.
143     * <p>
144     * It follows the same logic used for files.
145     * <p>
146     * It does a lenient charset encoding detection, check the constructor with
147     * the lenient parameter for details.
148     *
149     * @param is InputStream to create a Reader from.
150     * @throws IOException thrown if there is a problem reading the stream.
151     */
152    public XmlStreamReader(final InputStream is) throws IOException {
153        this(is, true);
154    }
155
156    /**
157     * Creates a Reader for a raw InputStream.
158     * <p>
159     * It follows the same logic used for files.
160     * <p>
161     * If lenient detection is indicated and the detection above fails as per
162     * specifications it then attempts the following:
163     * <p>
164     * If the content type was 'text/html' it replaces it with 'text/xml' and
165     * tries the detection again.
166     * <p>
167     * Else if the XML prolog had a charset encoding that encoding is used.
168     * <p>
169     * Else if the content type had a charset encoding that encoding is used.
170     * <p>
171     * Else 'UTF-8' is used.
172     * <p>
173     * If lenient detection is indicated an XmlStreamReaderException is never
174     * thrown.
175     *
176     * @param is InputStream to create a Reader from.
177     * @param lenient indicates if the charset encoding detection should be
178     *        relaxed.
179     * @throws IOException thrown if there is a problem reading the stream.
180     * @throws XmlStreamReaderException thrown if the charset encoding could not
181     *         be determined according to the specs.
182     */
183    public XmlStreamReader(final InputStream is, final boolean lenient) throws IOException {
184        this(is, lenient, null);
185    }
186
187    /**
188     * Creates a Reader for a raw InputStream.
189     * <p>
190     * It follows the same logic used for files.
191     * <p>
192     * If lenient detection is indicated and the detection above fails as per
193     * specifications it then attempts the following:
194     * <p>
195     * If the content type was 'text/html' it replaces it with 'text/xml' and
196     * tries the detection again.
197     * <p>
198     * Else if the XML prolog had a charset encoding that encoding is used.
199     * <p>
200     * Else if the content type had a charset encoding that encoding is used.
201     * <p>
202     * Else 'UTF-8' is used.
203     * <p>
204     * If lenient detection is indicated an XmlStreamReaderException is never
205     * thrown.
206     *
207     * @param is InputStream to create a Reader from.
208     * @param lenient indicates if the charset encoding detection should be
209     *        relaxed.
210     * @param defaultEncoding The default encoding
211     * @throws IOException thrown if there is a problem reading the stream.
212     * @throws XmlStreamReaderException thrown if the charset encoding could not
213     *         be determined according to the specs.
214     */
215    public XmlStreamReader(final InputStream is, final boolean lenient, final String defaultEncoding)
216            throws IOException {
217        this.defaultEncoding = defaultEncoding;
218        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
219        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
220        this.encoding = doRawStream(bom, pis, lenient);
221        this.reader = new InputStreamReader(pis, encoding);
222    }
223
224    /**
225     * Creates a Reader using the InputStream of a URL.
226     * <p>
227     * If the URL is not of type HTTP and there is not 'content-type' header in
228     * the fetched data it uses the same logic used for Files.
229     * <p>
230     * If the URL is a HTTP Url or there is a 'content-type' header in the
231     * fetched data it uses the same logic used for an InputStream with
232     * content-type.
233     * <p>
234     * It does a lenient charset encoding detection, check the constructor with
235     * the lenient parameter for details.
236     *
237     * @param url URL to create a Reader from.
238     * @throws IOException thrown if there is a problem reading the stream of
239     *         the URL.
240     */
241    public XmlStreamReader(final URL url) throws IOException {
242        this(url.openConnection(), null);
243    }
244
245    /**
246     * Creates a Reader using the InputStream of a URLConnection.
247     * <p>
248     * If the URLConnection is not of type HttpURLConnection and there is not
249     * 'content-type' header in the fetched data it uses the same logic used for
250     * files.
251     * <p>
252     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
253     * the fetched data it uses the same logic used for an InputStream with
254     * content-type.
255     * <p>
256     * It does a lenient charset encoding detection, check the constructor with
257     * the lenient parameter for details.
258     *
259     * @param conn URLConnection to create a Reader from.
260     * @param defaultEncoding The default encoding
261     * @throws IOException thrown if there is a problem reading the stream of
262     *         the URLConnection.
263     */
264    public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
265        this.defaultEncoding = defaultEncoding;
266        final boolean lenient = true;
267        final String contentType = conn.getContentType();
268        final InputStream is = conn.getInputStream();
269        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
270        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
271        if (conn instanceof HttpURLConnection || contentType != null) {
272            this.encoding = doHttpStream(bom, pis, contentType, lenient);
273        } else {
274            this.encoding = doRawStream(bom, pis, lenient);
275        }
276        this.reader = new InputStreamReader(pis, encoding);
277    }
278
279    /**
280     * Creates a Reader using an InputStream an the associated content-type
281     * header.
282     * <p>
283     * First it checks if the stream has BOM. If there is not BOM checks the
284     * content-type encoding. If there is not content-type encoding checks the
285     * XML prolog encoding. If there is not XML prolog encoding uses the default
286     * encoding mandated by the content-type MIME type.
287     * <p>
288     * It does a lenient charset encoding detection, check the constructor with
289     * the lenient parameter for details.
290     *
291     * @param is InputStream to create the reader from.
292     * @param httpContentType content-type header to use for the resolution of
293     *        the charset encoding.
294     * @throws IOException thrown if there is a problem reading the file.
295     */
296    public XmlStreamReader(final InputStream is, final String httpContentType)
297            throws IOException {
298        this(is, httpContentType, true);
299    }
300
301    /**
302     * Creates a Reader using an InputStream an the associated content-type
303     * header. This constructor is lenient regarding the encoding detection.
304     * <p>
305     * First it checks if the stream has BOM. If there is not BOM checks the
306     * content-type encoding. If there is not content-type encoding checks the
307     * XML prolog encoding. If there is not XML prolog encoding uses the default
308     * encoding mandated by the content-type MIME type.
309     * <p>
310     * If lenient detection is indicated and the detection above fails as per
311     * specifications it then attempts the following:
312     * <p>
313     * If the content type was 'text/html' it replaces it with 'text/xml' and
314     * tries the detection again.
315     * <p>
316     * Else if the XML prolog had a charset encoding that encoding is used.
317     * <p>
318     * Else if the content type had a charset encoding that encoding is used.
319     * <p>
320     * Else 'UTF-8' is used.
321     * <p>
322     * If lenient detection is indicated an XmlStreamReaderException is never
323     * thrown.
324     *
325     * @param is InputStream to create the reader from.
326     * @param httpContentType content-type header to use for the resolution of
327     *        the charset encoding.
328     * @param lenient indicates if the charset encoding detection should be
329     *        relaxed.
330     * @param defaultEncoding The default encoding
331     * @throws IOException thrown if there is a problem reading the file.
332     * @throws XmlStreamReaderException thrown if the charset encoding could not
333     *         be determined according to the specs.
334     */
335    public XmlStreamReader(final InputStream is, final String httpContentType,
336            final boolean lenient, final String defaultEncoding) throws IOException {
337        this.defaultEncoding = defaultEncoding;
338        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
339        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
340        this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
341        this.reader = new InputStreamReader(pis, encoding);
342    }
343
344    /**
345     * Creates a Reader using an InputStream an the associated content-type
346     * header. This constructor is lenient regarding the encoding detection.
347     * <p>
348     * First it checks if the stream has BOM. If there is not BOM checks the
349     * content-type encoding. If there is not content-type encoding checks the
350     * XML prolog encoding. If there is not XML prolog encoding uses the default
351     * encoding mandated by the content-type MIME type.
352     * <p>
353     * If lenient detection is indicated and the detection above fails as per
354     * specifications it then attempts the following:
355     * <p>
356     * If the content type was 'text/html' it replaces it with 'text/xml' and
357     * tries the detection again.
358     * <p>
359     * Else if the XML prolog had a charset encoding that encoding is used.
360     * <p>
361     * Else if the content type had a charset encoding that encoding is used.
362     * <p>
363     * Else 'UTF-8' is used.
364     * <p>
365     * If lenient detection is indicated an XmlStreamReaderException is never
366     * thrown.
367     *
368     * @param is InputStream to create the reader from.
369     * @param httpContentType content-type header to use for the resolution of
370     *        the charset encoding.
371     * @param lenient indicates if the charset encoding detection should be
372     *        relaxed.
373     * @throws IOException thrown if there is a problem reading the file.
374     * @throws XmlStreamReaderException thrown if the charset encoding could not
375     *         be determined according to the specs.
376     */
377    public XmlStreamReader(final InputStream is, final String httpContentType,
378            final boolean lenient) throws IOException {
379        this(is, httpContentType, lenient, null);
380    }
381
382    /**
383     * Returns the charset encoding of the XmlStreamReader.
384     *
385     * @return charset encoding.
386     */
387    public String getEncoding() {
388        return encoding;
389    }
390
391    /**
392     * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
393     * @param buf the buffer to read the characters into
394     * @param offset The start offset
395     * @param len The number of bytes to read
396     * @return the number of characters read or -1 if the end of stream
397     * @throws IOException if an I/O error occurs
398     */
399    @Override
400    public int read(final char[] buf, final int offset, final int len) throws IOException {
401        return reader.read(buf, offset, len);
402    }
403
404    /**
405     * Closes the XmlStreamReader stream.
406     *
407     * @throws IOException thrown if there was a problem closing the stream.
408     */
409    @Override
410    public void close() throws IOException {
411        reader.close();
412    }
413
414    /**
415     * Process the raw stream.
416     *
417     * @param bom BOMInputStream to detect byte order marks
418     * @param pis BOMInputStream to guess XML encoding
419     * @param lenient indicates if the charset encoding detection should be
420     *        relaxed.
421     * @return the encoding to be used
422     * @throws IOException thrown if there is a problem reading the stream.
423     */
424    private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
425            throws IOException {
426        final String bomEnc      = bom.getBOMCharsetName();
427        final String xmlGuessEnc = pis.getBOMCharsetName();
428        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
429        try {
430            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
431        } catch (final XmlStreamReaderException ex) {
432            if (lenient) {
433                return doLenientDetection(null, ex);
434            } else {
435                throw ex;
436            }
437        }
438    }
439
440    /**
441     * Process a HTTP stream.
442     *
443     * @param bom BOMInputStream to detect byte order marks
444     * @param pis BOMInputStream to guess XML encoding
445     * @param httpContentType The HTTP content type
446     * @param lenient indicates if the charset encoding detection should be
447     *        relaxed.
448     * @return the encoding to be used
449     * @throws IOException thrown if there is a problem reading the stream.
450     */
451    private String doHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
452            final boolean lenient) throws IOException {
453        final String bomEnc      = bom.getBOMCharsetName();
454        final String xmlGuessEnc = pis.getBOMCharsetName();
455        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
456        try {
457            return calculateHttpEncoding(httpContentType, bomEnc,
458                    xmlGuessEnc, xmlEnc, lenient);
459        } catch (final XmlStreamReaderException ex) {
460            if (lenient) {
461                return doLenientDetection(httpContentType, ex);
462            } else {
463                throw ex;
464            }
465        }
466    }
467
468    /**
469     * Do lenient detection.
470     *
471     * @param httpContentType content-type header to use for the resolution of
472     *        the charset encoding.
473     * @param ex The thrown exception
474     * @return the encoding
475     * @throws IOException thrown if there is a problem reading the stream.
476     */
477    private String doLenientDetection(String httpContentType,
478            XmlStreamReaderException ex) throws IOException {
479        if (httpContentType != null && httpContentType.startsWith("text/html")) {
480            httpContentType = httpContentType.substring("text/html".length());
481            httpContentType = "text/xml" + httpContentType;
482            try {
483                return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
484                        ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
485            } catch (final XmlStreamReaderException ex2) {
486                ex = ex2;
487            }
488        }
489        String encoding = ex.getXmlEncoding();
490        if (encoding == null) {
491            encoding = ex.getContentTypeEncoding();
492        }
493        if (encoding == null) {
494            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
495        }
496        return encoding;
497    }
498
499    /**
500     * Calculate the raw encoding.
501     *
502     * @param bomEnc BOM encoding
503     * @param xmlGuessEnc XML Guess encoding
504     * @param xmlEnc XML encoding
505     * @return the raw encoding
506     * @throws IOException thrown if there is a problem reading the stream.
507     */
508    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
509            final String xmlEnc) throws IOException {
510
511        // BOM is Null
512        if (bomEnc == null) {
513            if (xmlGuessEnc == null || xmlEnc == null) {
514                return defaultEncoding == null ? UTF_8 : defaultEncoding;
515            }
516            if (xmlEnc.equals(UTF_16) &&
517               (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
518                return xmlGuessEnc;
519            }
520            return xmlEnc;
521        }
522
523        // BOM is UTF-8
524        if (bomEnc.equals(UTF_8)) {
525            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
526                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
527                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
528            }
529            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
530                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
531                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
532            }
533            return bomEnc;
534        }
535
536        // BOM is UTF-16BE or UTF-16LE
537        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
538            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
539                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
540                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
541            }
542            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
543                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
544                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
545            }
546            return bomEnc;
547        }
548
549        // BOM is UTF-32BE or UTF-32LE
550        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
551            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
552                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
553                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
554            }
555            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
556                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
557                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
558            }
559            return bomEnc;
560        }
561
562        // BOM is something else
563        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
564        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
565    }
566
567
568    /**
569     * Calculate the HTTP encoding.
570     *
571     * @param httpContentType The HTTP content type
572     * @param bomEnc BOM encoding
573     * @param xmlGuessEnc XML Guess encoding
574     * @param xmlEnc XML encoding
575     * @param lenient indicates if the charset encoding detection should be
576     *        relaxed.
577     * @return the HTTP encoding
578     * @throws IOException thrown if there is a problem reading the stream.
579     */
580    String calculateHttpEncoding(final String httpContentType,
581            final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
582            final boolean lenient) throws IOException {
583
584        // Lenient and has XML encoding
585        if (lenient && xmlEnc != null) {
586            return xmlEnc;
587        }
588
589        // Determine mime/encoding content types from HTTP Content Type
590        final String cTMime = getContentTypeMime(httpContentType);
591        final String cTEnc  = getContentTypeEncoding(httpContentType);
592        final boolean appXml  = isAppXml(cTMime);
593        final boolean textXml = isTextXml(cTMime);
594
595        // Mime type NOT "application/xml" or "text/xml"
596        if (!appXml && !textXml) {
597            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
598            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
599        }
600
601        // No content type encoding
602        if (cTEnc == null) {
603            if (appXml) {
604                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
605            } else {
606                return defaultEncoding == null ? US_ASCII : defaultEncoding;
607            }
608        }
609
610        // UTF-16BE or UTF-16LE content type encoding
611        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
612            if (bomEnc != null) {
613                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
614                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
615            }
616            return cTEnc;
617        }
618
619        // UTF-16 content type encoding
620        if (cTEnc.equals(UTF_16)) {
621            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
622                return bomEnc;
623            }
624            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
625            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
626        }
627
628        // UTF-32BE or UTF-132E content type encoding
629        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
630            if (bomEnc != null) {
631                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
632                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
633            }
634            return cTEnc;
635        }
636
637        // UTF-32 content type encoding
638        if (cTEnc.equals(UTF_32)) {
639            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
640                return bomEnc;
641            }
642            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
643            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
644        }
645
646        return cTEnc;
647    }
648
649    /**
650     * Returns MIME type or NULL if httpContentType is NULL.
651     *
652     * @param httpContentType the HTTP content type
653     * @return The mime content type
654     */
655    static String getContentTypeMime(final String httpContentType) {
656        String mime = null;
657        if (httpContentType != null) {
658            final int i = httpContentType.indexOf(";");
659            if (i >= 0) {
660                mime = httpContentType.substring(0, i);
661            } else {
662                mime = httpContentType;
663            }
664            mime = mime.trim();
665        }
666        return mime;
667    }
668
669    private static final Pattern CHARSET_PATTERN = Pattern
670            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
671
672    /**
673     * Returns charset parameter value, NULL if not present, NULL if
674     * httpContentType is NULL.
675     *
676     * @param httpContentType the HTTP content type
677     * @return The content type encoding (upcased)
678     */
679    static String getContentTypeEncoding(final String httpContentType) {
680        String encoding = null;
681        if (httpContentType != null) {
682            final int i = httpContentType.indexOf(";");
683            if (i > -1) {
684                final String postMime = httpContentType.substring(i + 1);
685                final Matcher m = CHARSET_PATTERN.matcher(postMime);
686                encoding = m.find() ? m.group(1) : null;
687                encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
688            }
689        }
690        return encoding;
691    }
692
693    public static final Pattern ENCODING_PATTERN = Pattern.compile(
694            "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
695            Pattern.MULTILINE);
696
697    /**
698     * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
699     *
700     * @param is InputStream to create the reader from.
701     * @param guessedEnc guessed encoding
702     * @return the encoding declared in the <?xml encoding=...?>
703     * @throws IOException thrown if there is a problem reading the stream.
704     */
705    private static String getXmlProlog(final InputStream is, final String guessedEnc)
706            throws IOException {
707        String encoding = null;
708        if (guessedEnc != null) {
709            final byte[] bytes = new byte[BUFFER_SIZE];
710            is.mark(BUFFER_SIZE);
711            int offset = 0;
712            int max = BUFFER_SIZE;
713            int c = is.read(bytes, offset, max);
714            int firstGT = -1;
715            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
716            while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
717                offset += c;
718                max -= c;
719                c = is.read(bytes, offset, max);
720                xmlProlog = new String(bytes, 0, offset, guessedEnc);
721                firstGT = xmlProlog.indexOf('>');
722            }
723            if (firstGT == -1) {
724                if (c == -1) {
725                    throw new IOException("Unexpected end of XML stream");
726                } else {
727                    throw new IOException(
728                            "XML prolog or ROOT element not found on first "
729                                    + offset + " bytes");
730                }
731            }
732            final int bytesRead = offset;
733            if (bytesRead > 0) {
734                is.reset();
735                final BufferedReader bReader = new BufferedReader(new StringReader(
736                        xmlProlog.substring(0, firstGT + 1)));
737                final StringBuffer prolog = new StringBuffer();
738                String line = bReader.readLine();
739                while (line != null) {
740                    prolog.append(line);
741                    line = bReader.readLine();
742                }
743                final Matcher m = ENCODING_PATTERN.matcher(prolog);
744                if (m.find()) {
745                    encoding = m.group(1).toUpperCase();
746                    encoding = encoding.substring(1, encoding.length() - 1);
747                }
748            }
749        }
750        return encoding;
751    }
752
753    /**
754     * Indicates if the MIME type belongs to the APPLICATION XML family.
755     *
756     * @param mime The mime type
757     * @return true if the mime type belongs to the APPLICATION XML family,
758     * otherwise false
759     */
760    static boolean isAppXml(final String mime) {
761        return mime != null &&
762               (mime.equals("application/xml") ||
763                mime.equals("application/xml-dtd") ||
764                mime.equals("application/xml-external-parsed-entity") ||
765               mime.startsWith("application/") && mime.endsWith("+xml"));
766    }
767
768    /**
769     * Indicates if the MIME type belongs to the TEXT XML family.
770     *
771     * @param mime The mime type
772     * @return true if the mime type belongs to the TEXT XML family,
773     * otherwise false
774     */
775    static boolean isTextXml(final String mime) {
776        return mime != null &&
777              (mime.equals("text/xml") ||
778               mime.equals("text/xml-external-parsed-entity") ||
779              mime.startsWith("text/") && mime.endsWith("+xml"));
780    }
781
782    private static final String RAW_EX_1 =
783        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
784
785    private static final String RAW_EX_2 =
786        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
787
788    private static final String HTTP_EX_1 =
789        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
790
791    private static final String HTTP_EX_2 =
792        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
793
794    private static final String HTTP_EX_3 =
795        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
796
797}