001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.io.input;
018    
019    import java.io.BufferedInputStream;
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileInputStream;
023    import java.io.IOException;
024    import java.io.InputStream;
025    import java.io.InputStreamReader;
026    import java.io.Reader;
027    import java.io.StringReader;
028    import java.net.HttpURLConnection;
029    import java.net.URL;
030    import java.net.URLConnection;
031    import java.text.MessageFormat;
032    import java.util.Locale;
033    import java.util.regex.Matcher;
034    import java.util.regex.Pattern;
035    
036    import org.apache.commons.io.ByteOrderMark;
037    
038    /**
039     * Character stream that handles all the necessary voodoo to figure out the
040     * charset encoding of the XML document within the stream.
041     * <p>
042     * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
043     * This one IS a character stream.
044     * <p>
045     * All this has to be done without consuming characters from the stream; otherwise
046     * the XML parser will not recognize the document as valid XML. This is not
047     * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
048     * right now, XmlStreamReader handles it and things work in all parsers).
049     * <p>
050     * The XmlStreamReader class handles the charset encoding of XML documents in
051     * Files, raw streams and HTTP streams by offering a wide set of constructors.
052     * <p>
053     * By default the charset encoding detection is lenient; the constructor with
054     * the lenient flag can be used for a strict detection (following HTTP MIME and XML
055     * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
056     * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
057     * Determining the character encoding of a feed</a>.
058     * <p>
059     * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
060     * Apache License 2.0.
061     *
062     * @version $Id: XmlStreamReader.java 1346400 2012-06-05 14:48:01Z ggregory $
063     * @see org.apache.commons.io.output.XmlStreamWriter
064     * @since 2.0
065     */
066    public class XmlStreamReader extends Reader {
067        private static final int BUFFER_SIZE = 4096;
068    
069        private static final String UTF_8 = "UTF-8";
070    
071        private static final String US_ASCII = "US-ASCII";
072    
073        private static final String UTF_16BE = "UTF-16BE";
074    
075        private static final String UTF_16LE = "UTF-16LE";
076    
077        private static final String UTF_32BE = "UTF-32BE";
078    
079        private static final String UTF_32LE = "UTF-32LE";
080    
081        private static final String UTF_16 = "UTF-16";
082    
083        private static final String UTF_32 = "UTF-32";
084    
085        private static final String EBCDIC = "CP1047";
086    
087        private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
088            ByteOrderMark.UTF_8,
089            ByteOrderMark.UTF_16BE,
090            ByteOrderMark.UTF_16LE,
091            ByteOrderMark.UTF_32BE,
092            ByteOrderMark.UTF_32LE
093        };
094        
095        // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
096        private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
097            new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
098            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
099            new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
100            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 
101                    0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
102            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 
103                    0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
104            new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
105        };
106    
107        private final Reader reader;
108    
109        private final String encoding;
110    
111        private final String defaultEncoding;
112    
113        /**
114         * Returns the default encoding to use if none is set in HTTP content-type,
115         * XML prolog and the rules based on content-type are not adequate.
116         * <p>
117         * If it is NULL the content-type based rules are used.
118         *
119         * @return the default encoding to use.
120         */
121        public String getDefaultEncoding() {
122            return defaultEncoding;
123        }
124    
    /**
     * Creates a Reader for a File.
     * <p>
     * The file is opened as a {@link FileInputStream} and handed to the
     * {@link #XmlStreamReader(InputStream)} constructor. It looks for a byte
     * order mark (UTF-8, UTF-16 or UTF-32) first, if there is none it sniffs
     * the XML prolog charset, and if this is also missing it defaults to UTF-8.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * <p>
     * NOTE(review): the stream opened here is not closed by this constructor
     * if the delegated constructor throws.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(File file) throws IOException {
        this(new FileInputStream(file));
    }
140    
    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files: byte order mark first, then
     * the XML prolog, then the fallback encoding.
     * <p>
     * It does a lenient charset encoding detection (it delegates with the
     * lenient flag set to {@code true}); check the constructor with the
     * lenient parameter for details.
     *
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(InputStream is) throws IOException {
        this(is, true);
    }
155    
    /**
     * Creates a Reader for a raw InputStream, with no default encoding.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, boolean lenient) throws IOException {
        this(is, lenient, null);
    }
186    
187        /**
188         * Creates a Reader for a raw InputStream.
189         * <p>
190         * It follows the same logic used for files.
191         * <p>
192         * If lenient detection is indicated and the detection above fails as per
193         * specifications it then attempts the following:
194         * <p>
195         * If the content type was 'text/html' it replaces it with 'text/xml' and
196         * tries the detection again.
197         * <p>
198         * Else if the XML prolog had a charset encoding that encoding is used.
199         * <p>
200         * Else if the content type had a charset encoding that encoding is used.
201         * <p>
202         * Else 'UTF-8' is used.
203         * <p>
204         * If lenient detection is indicated an XmlStreamReaderException is never
205         * thrown.
206         *
207         * @param is InputStream to create a Reader from.
208         * @param lenient indicates if the charset encoding detection should be
209         *        relaxed.
210         * @param defaultEncoding The default encoding
211         * @throws IOException thrown if there is a problem reading the stream.
212         * @throws XmlStreamReaderException thrown if the charset encoding could not
213         *         be determined according to the specs.
214         */
215        public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException {
216            this.defaultEncoding = defaultEncoding;
217            BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
218            BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
219            this.encoding = doRawStream(bom, pis, lenient);
220            this.reader = new InputStreamReader(pis, encoding);
221        }
222    
    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * The URL's connection is opened and passed, with no default encoding, to
     * the {@link #XmlStreamReader(URLConnection, String)} constructor.
     * <p>
     * If the URL is not of type HTTP and there is no 'content-type' header in
     * the fetched data it uses the same logic used for Files.
     * <p>
     * If the URL is an HTTP URL or there is a 'content-type' header in the
     * fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URL.
     */
    public XmlStreamReader(URL url) throws IOException {
        this(url.openConnection(), null);
    }
243    
244        /**
245         * Creates a Reader using the InputStream of a URLConnection.
246         * <p>
247         * If the URLConnection is not of type HttpURLConnection and there is not
248         * 'content-type' header in the fetched data it uses the same logic used for
249         * files.
250         * <p>
251         * If the URLConnection is a HTTP Url or there is a 'content-type' header in
252         * the fetched data it uses the same logic used for an InputStream with
253         * content-type.
254         * <p>
255         * It does a lenient charset encoding detection, check the constructor with
256         * the lenient parameter for details.
257         *
258         * @param conn URLConnection to create a Reader from.
259         * @param defaultEncoding The default encoding
260         * @throws IOException thrown if there is a problem reading the stream of
261         *         the URLConnection.
262         */
263        public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException {
264            this.defaultEncoding = defaultEncoding;
265            boolean lenient = true;
266            String contentType = conn.getContentType();
267            InputStream is = conn.getInputStream();
268            BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
269            BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
270            if (conn instanceof HttpURLConnection || contentType != null) {
271                this.encoding = doHttpStream(bom, pis, contentType, lenient);
272            } else {
273                this.encoding = doRawStream(bom, pis, lenient);
274            }
275            this.reader = new InputStreamReader(pis, encoding);
276        }
277    
    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM it checks the
     * content-type encoding. If there is no content-type encoding it checks the
     * XML prolog encoding. If there is no XML prolog encoding it uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * It does a lenient charset encoding detection (it delegates with the
     * lenient flag set to {@code true}); check the constructor with the
     * lenient parameter for details.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(InputStream is, String httpContentType)
            throws IOException {
        this(is, httpContentType, true);
    }
299    
300        /**
301         * Creates a Reader using an InputStream an the associated content-type
302         * header. This constructor is lenient regarding the encoding detection.
303         * <p>
304         * First it checks if the stream has BOM. If there is not BOM checks the
305         * content-type encoding. If there is not content-type encoding checks the
306         * XML prolog encoding. If there is not XML prolog encoding uses the default
307         * encoding mandated by the content-type MIME type.
308         * <p>
309         * If lenient detection is indicated and the detection above fails as per
310         * specifications it then attempts the following:
311         * <p>
312         * If the content type was 'text/html' it replaces it with 'text/xml' and
313         * tries the detection again.
314         * <p>
315         * Else if the XML prolog had a charset encoding that encoding is used.
316         * <p>
317         * Else if the content type had a charset encoding that encoding is used.
318         * <p>
319         * Else 'UTF-8' is used.
320         * <p>
321         * If lenient detection is indicated an XmlStreamReaderException is never
322         * thrown.
323         *
324         * @param is InputStream to create the reader from.
325         * @param httpContentType content-type header to use for the resolution of
326         *        the charset encoding.
327         * @param lenient indicates if the charset encoding detection should be
328         *        relaxed.
329         * @param defaultEncoding The default encoding
330         * @throws IOException thrown if there is a problem reading the file.
331         * @throws XmlStreamReaderException thrown if the charset encoding could not
332         *         be determined according to the specs.
333         */
334        public XmlStreamReader(InputStream is, String httpContentType,
335                boolean lenient, String defaultEncoding) throws IOException {
336            this.defaultEncoding = defaultEncoding;
337            BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
338            BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
339            this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
340            this.reader = new InputStreamReader(pis, encoding);
341        }
342    
    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header, with no default encoding. Strictness of the encoding detection
     * is controlled by the {@code lenient} parameter.
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM it checks the
     * content-type encoding. If there is no content-type encoding it checks the
     * XML prolog encoding. If there is no XML prolog encoding it uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, String httpContentType,
            boolean lenient) throws IOException {
        this(is, httpContentType, lenient, null);
    }
380    
381        /**
382         * Returns the charset encoding of the XmlStreamReader.
383         *
384         * @return charset encoding.
385         */
386        public String getEncoding() {
387            return encoding;
388        }
389    
390        /**
391         * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
392         * @param buf the buffer to read the characters into
393         * @param offset The start offset
394         * @param len The number of bytes to read
395         * @return the number of characters read or -1 if the end of stream
396         * @throws IOException if an I/O error occurs
397         */
398        @Override
399        public int read(char[] buf, int offset, int len) throws IOException {
400            return reader.read(buf, offset, len);
401        }
402    
403        /**
404         * Closes the XmlStreamReader stream.
405         *
406         * @throws IOException thrown if there was a problem closing the stream.
407         */
408        @Override
409        public void close() throws IOException {
410            reader.close();
411        }
412    
413        /**
414         * Process the raw stream.
415         *
416         * @param bom BOMInputStream to detect byte order marks
417         * @param pis BOMInputStream to guess XML encoding
418         * @param lenient indicates if the charset encoding detection should be
419         *        relaxed.
420         * @return the encoding to be used
421         * @throws IOException thrown if there is a problem reading the stream.
422         */
423        private String doRawStream(BOMInputStream bom, BOMInputStream pis, boolean lenient)
424                throws IOException {
425            String bomEnc      = bom.getBOMCharsetName();
426            String xmlGuessEnc = pis.getBOMCharsetName();
427            String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
428            try {
429                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
430            } catch (XmlStreamReaderException ex) {
431                if (lenient) {
432                    return doLenientDetection(null, ex);
433                } else {
434                    throw ex;
435                }
436            }
437        }
438    
439        /**
440         * Process a HTTP stream.
441         *
442         * @param bom BOMInputStream to detect byte order marks
443         * @param pis BOMInputStream to guess XML encoding
444         * @param httpContentType The HTTP content type
445         * @param lenient indicates if the charset encoding detection should be
446         *        relaxed.
447         * @return the encoding to be used
448         * @throws IOException thrown if there is a problem reading the stream.
449         */
450        private String doHttpStream(BOMInputStream bom, BOMInputStream pis, String httpContentType,
451                boolean lenient) throws IOException {
452            String bomEnc      = bom.getBOMCharsetName();
453            String xmlGuessEnc = pis.getBOMCharsetName();
454            String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
455            try {
456                return calculateHttpEncoding(httpContentType, bomEnc,
457                        xmlGuessEnc, xmlEnc, lenient);
458            } catch (XmlStreamReaderException ex) {
459                if (lenient) {
460                    return doLenientDetection(httpContentType, ex);
461                } else {
462                    throw ex;
463                }
464            }
465        }
466    
467        /**
468         * Do lenient detection.
469         *
470         * @param httpContentType content-type header to use for the resolution of
471         *        the charset encoding.
472         * @param ex The thrown exception
473         * @return the encoding
474         * @throws IOException thrown if there is a problem reading the stream.
475         */
476        private String doLenientDetection(String httpContentType,
477                XmlStreamReaderException ex) throws IOException {
478            if (httpContentType != null && httpContentType.startsWith("text/html")) {
479                httpContentType = httpContentType.substring("text/html".length());
480                httpContentType = "text/xml" + httpContentType;
481                try {
482                    return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
483                            ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
484                } catch (XmlStreamReaderException ex2) {
485                    ex = ex2;
486                }
487            }
488            String encoding = ex.getXmlEncoding();
489            if (encoding == null) {
490                encoding = ex.getContentTypeEncoding();
491            }
492            if (encoding == null) {
493                encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
494            }
495            return encoding;
496        }
497    
498        /**
499         * Calculate the raw encoding.
500         *
501         * @param bomEnc BOM encoding
502         * @param xmlGuessEnc XML Guess encoding
503         * @param xmlEnc XML encoding
504         * @return the raw encoding
505         * @throws IOException thrown if there is a problem reading the stream.
506         */
507        String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
508                String xmlEnc) throws IOException {
509    
510            // BOM is Null
511            if (bomEnc == null) {
512                if (xmlGuessEnc == null || xmlEnc == null) {
513                    return defaultEncoding == null ? UTF_8 : defaultEncoding;
514                }
515                if (xmlEnc.equals(UTF_16) &&
516                   (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
517                    return xmlGuessEnc;
518                }
519                return xmlEnc;
520            }
521    
522            // BOM is UTF-8
523            if (bomEnc.equals(UTF_8)) {
524                if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
525                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
526                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
527                }
528                if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
529                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
530                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
531                }
532                return bomEnc;
533            }
534    
535            // BOM is UTF-16BE or UTF-16LE
536            if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
537                if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
538                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
539                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
540                }
541                if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
542                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
543                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
544                }
545                return bomEnc;
546            }
547    
548            // BOM is UTF-32BE or UTF-32LE
549            if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
550                if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
551                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
552                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
553                }
554                if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
555                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
556                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
557                }
558                return bomEnc;
559            }
560    
561            // BOM is something else
562            String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
563            throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
564        }
565    
566    
567        /**
568         * Calculate the HTTP encoding.
569         *
570         * @param httpContentType The HTTP content type
571         * @param bomEnc BOM encoding
572         * @param xmlGuessEnc XML Guess encoding
573         * @param xmlEnc XML encoding
574         * @param lenient indicates if the charset encoding detection should be
575         *        relaxed.
576         * @return the HTTP encoding
577         * @throws IOException thrown if there is a problem reading the stream.
578         */
579        String calculateHttpEncoding(String httpContentType,
580                String bomEnc, String xmlGuessEnc, String xmlEnc,
581                boolean lenient) throws IOException {
582    
583            // Lenient and has XML encoding
584            if (lenient && xmlEnc != null) {
585                return xmlEnc;
586            }
587    
588            // Determine mime/encoding content types from HTTP Content Type
589            String cTMime = getContentTypeMime(httpContentType);
590            String cTEnc  = getContentTypeEncoding(httpContentType);
591            boolean appXml  = isAppXml(cTMime);
592            boolean textXml = isTextXml(cTMime);
593    
594            // Mime type NOT "application/xml" or "text/xml"
595            if (!appXml && !textXml) {
596                String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
597                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
598            }
599    
600            // No content type encoding
601            if (cTEnc == null) {
602                if (appXml) {
603                    return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
604                } else {
605                    return defaultEncoding == null ? US_ASCII : defaultEncoding;
606                }
607            }
608    
609            // UTF-16BE or UTF-16LE content type encoding
610            if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
611                if (bomEnc != null) {
612                    String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
613                    throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
614                }
615                return cTEnc;
616            }
617    
618            // UTF-16 content type encoding
619            if (cTEnc.equals(UTF_16)) {
620                if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
621                    return bomEnc;
622                }
623                String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
624                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
625            }
626    
627            // UTF-32BE or UTF-132E content type encoding
628            if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
629                if (bomEnc != null) {
630                    String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
631                    throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
632                }
633                return cTEnc;
634            }
635    
636            // UTF-32 content type encoding
637            if (cTEnc.equals(UTF_32)) {
638                if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
639                    return bomEnc;
640                }
641                String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
642                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
643            }
644    
645            return cTEnc;
646        }
647    
648        /**
649         * Returns MIME type or NULL if httpContentType is NULL.
650         *
651         * @param httpContentType the HTTP content type
652         * @return The mime content type
653         */
654        static String getContentTypeMime(String httpContentType) {
655            String mime = null;
656            if (httpContentType != null) {
657                int i = httpContentType.indexOf(";");
658                if (i >= 0) {
659                    mime = httpContentType.substring(0, i);
660                } else {
661                    mime = httpContentType;
662                }
663                mime = mime.trim();
664            }
665            return mime;
666        }
667    
668        private static final Pattern CHARSET_PATTERN = Pattern
669                .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
670    
671        /**
672         * Returns charset parameter value, NULL if not present, NULL if
673         * httpContentType is NULL.
674         *
675         * @param httpContentType the HTTP content type
676         * @return The content type encoding (upcased)
677         */
678        static String getContentTypeEncoding(String httpContentType) {
679            String encoding = null;
680            if (httpContentType != null) {
681                int i = httpContentType.indexOf(";");
682                if (i > -1) {
683                    String postMime = httpContentType.substring(i + 1);
684                    Matcher m = CHARSET_PATTERN.matcher(postMime);
685                    encoding = m.find() ? m.group(1) : null;
686                    encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
687                }
688            }
689            return encoding;
690        }
691    
692        public static final Pattern ENCODING_PATTERN = Pattern.compile(
693                "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
694                Pattern.MULTILINE);
695    
696        /**
697         * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
698         *
699         * @param is InputStream to create the reader from.
700         * @param guessedEnc guessed encoding
701         * @return the encoding declared in the <?xml encoding=...?>
702         * @throws IOException thrown if there is a problem reading the stream.
703         */
704        private static String getXmlProlog(InputStream is, String guessedEnc)
705                throws IOException {
706            String encoding = null;
707            if (guessedEnc != null) {
708                byte[] bytes = new byte[BUFFER_SIZE];
709                is.mark(BUFFER_SIZE);
710                int offset = 0;
711                int max = BUFFER_SIZE;
712                int c = is.read(bytes, offset, max);
713                int firstGT = -1;
714                String xmlProlog = null;
715                while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
716                    offset += c;
717                    max -= c;
718                    c = is.read(bytes, offset, max);
719                    xmlProlog = new String(bytes, 0, offset, guessedEnc);
720                    firstGT = xmlProlog.indexOf('>');
721                }
722                if (firstGT == -1) {
723                    if (c == -1) {
724                        throw new IOException("Unexpected end of XML stream");
725                    } else {
726                        throw new IOException(
727                                "XML prolog or ROOT element not found on first "
728                                        + offset + " bytes");
729                    }
730                }
731                int bytesRead = offset;
732                if (bytesRead > 0) {
733                    is.reset();
734                    BufferedReader bReader = new BufferedReader(new StringReader(
735                            xmlProlog.substring(0, firstGT + 1)));
736                    StringBuffer prolog = new StringBuffer();
737                    String line = bReader.readLine();
738                    while (line != null) {
739                        prolog.append(line);
740                        line = bReader.readLine();
741                    }
742                    Matcher m = ENCODING_PATTERN.matcher(prolog);
743                    if (m.find()) {
744                        encoding = m.group(1).toUpperCase();
745                        encoding = encoding.substring(1, encoding.length() - 1);
746                    }
747                }
748            }
749            return encoding;
750        }
751    
752        /**
753         * Indicates if the MIME type belongs to the APPLICATION XML family.
754         * 
755         * @param mime The mime type
756         * @return true if the mime type belongs to the APPLICATION XML family,
757         * otherwise false
758         */
759        static boolean isAppXml(String mime) {
760            return mime != null &&
761                   (mime.equals("application/xml") || 
762                    mime.equals("application/xml-dtd") ||
763                    mime.equals("application/xml-external-parsed-entity") ||
764                   mime.startsWith("application/") && mime.endsWith("+xml"));
765        }
766    
767        /**
768         * Indicates if the MIME type belongs to the TEXT XML family.
769         * 
770         * @param mime The mime type
771         * @return true if the mime type belongs to the TEXT XML family,
772         * otherwise false
773         */
774        static boolean isTextXml(String mime) {
775            return mime != null &&
776                  (mime.equals("text/xml") ||
777                   mime.equals("text/xml-external-parsed-entity") ||
778                  mime.startsWith("text/") && mime.endsWith("+xml"));
779        }
780    
    /**
     * Message template for an encoding mismatch: {0} is labelled BOM,
     * {1} XML guess and {2} XML prolog in the text.
     */
    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    /**
     * Message template for an unknown BOM: {0} is labelled BOM,
     * {1} XML guess and {2} XML prolog in the text.
     */
    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /**
     * MessageFormat template stating that the BOM must be NULL: {0} content
     * type MIME, {1} content type encoding, {2} BOM encoding, {3} XML guess
     * encoding, {4} XML prolog encoding.
     */
    private static final String HTTP_EX_1 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    /**
     * MessageFormat template for an encoding mismatch: {0} content type MIME,
     * {1} content type encoding, {2} BOM encoding, {3} XML guess encoding,
     * {4} XML prolog encoding.
     */
    private static final String HTTP_EX_2 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    /**
     * Message template for an invalid MIME type; the placeholders are
     * labelled in the text the same way as in HTTP_EX_1 and HTTP_EX_2.
     */
    private static final String HTTP_EX_3 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
795    
796    }