001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.io.input;
018    
019    import java.io.BufferedInputStream;
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileInputStream;
023    import java.io.IOException;
024    import java.io.InputStream;
025    import java.io.InputStreamReader;
026    import java.io.Reader;
027    import java.io.StringReader;
028    import java.net.HttpURLConnection;
029    import java.net.URL;
030    import java.net.URLConnection;
031    import java.text.MessageFormat;
032    import java.util.regex.Matcher;
033    import java.util.regex.Pattern;
034    
035    import org.apache.commons.io.ByteOrderMark;
036    
037    /**
038     * Character stream that handles all the necessary Voodoo to figure out the
039     * charset encoding of the XML document within the stream.
040     * <p>
041     * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
042     * This one IS a character stream.
043     * <p>
044     * All this has to be done without consuming characters from the stream, if not
045     * the XML parser will not recognize the document as a valid XML. This is not
046     * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
047     * right now, XmlStreamReader handles it and things work in all parsers).
048     * <p>
049     * The XmlStreamReader class handles the charset encoding of XML documents in
050     * Files, raw streams and HTTP streams by offering a wide set of constructors.
051     * <p>
052     * By default the charset encoding detection is lenient, the constructor with
053     * the lenient flag can be used for a strict detection (following HTTP MIME
054     * and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
055     * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
056     * Determining the character encoding of a feed</a>.
057     * <p>
058     * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
059     * Apache License 2.0.
060     *
061     * @author Alejandro Abdelnur
062     * @version $Id: XmlStreamReader.java 1021884 2010-10-12 18:49:16Z ggregory $
063     * @see org.apache.commons.io.output.XmlStreamWriter
064     * @since Commons IO 2.0
065     */
066    public class XmlStreamReader extends Reader {
067        private static final int BUFFER_SIZE = 4096;
068    
069        private static final String UTF_8 = "UTF-8";
070    
071        private static final String US_ASCII = "US-ASCII";
072    
073        private static final String UTF_16BE = "UTF-16BE";
074    
075        private static final String UTF_16LE = "UTF-16LE";
076    
077        private static final String UTF_16 = "UTF-16";
078    
079        private static final String EBCDIC = "CP1047";
080    
081        private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
082            ByteOrderMark.UTF_8,
083            ByteOrderMark.UTF_16BE,
084            ByteOrderMark.UTF_16LE
085        };
086        private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
087            new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
088            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
089            new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
090            new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
091        };
092    
093    
094        private final Reader reader;
095    
096        private final String encoding;
097    
098        private final String defaultEncoding;
099    
100        /**
101         * Returns the default encoding to use if none is set in HTTP content-type,
102         * XML prolog and the rules based on content-type are not adequate.
103         * <p>
104         * If it is NULL the content-type based rules are used.
105         *
106         * @return the default encoding to use.
107         */
108        public String getDefaultEncoding() {
109            return defaultEncoding;
110        }
111    
112        /**
113         * Creates a Reader for a File.
114         * <p>
115         * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
116         * if this is also missing defaults to UTF-8.
117         * <p>
118         * It does a lenient charset encoding detection, check the constructor with
119         * the lenient parameter for details.
120         *
121         * @param file File to create a Reader from.
122         * @throws IOException thrown if there is a problem reading the file.
123         */
124        public XmlStreamReader(File file) throws IOException {
125            this(new FileInputStream(file));
126        }
127    
128        /**
129         * Creates a Reader for a raw InputStream.
130         * <p>
131         * It follows the same logic used for files.
132         * <p>
133         * It does a lenient charset encoding detection, check the constructor with
134         * the lenient parameter for details.
135         *
136         * @param is InputStream to create a Reader from.
137         * @throws IOException thrown if there is a problem reading the stream.
138         */
139        public XmlStreamReader(InputStream is) throws IOException {
140            this(is, true);
141        }
142    
143        /**
144         * Creates a Reader for a raw InputStream.
145         * <p>
146         * It follows the same logic used for files.
147         * <p>
148         * If lenient detection is indicated and the detection above fails as per
149         * specifications it then attempts the following:
150         * <p>
151         * If the content type was 'text/html' it replaces it with 'text/xml' and
152         * tries the detection again.
153         * <p>
154         * Else if the XML prolog had a charset encoding that encoding is used.
155         * <p>
156         * Else if the content type had a charset encoding that encoding is used.
157         * <p>
158         * Else 'UTF-8' is used.
159         * <p>
160         * If lenient detection is indicated an XmlStreamReaderException is never
161         * thrown.
162         *
163         * @param is InputStream to create a Reader from.
164         * @param lenient indicates if the charset encoding detection should be
165         *        relaxed.
166         * @throws IOException thrown if there is a problem reading the stream.
167         * @throws XmlStreamReaderException thrown if the charset encoding could not
168         *         be determined according to the specs.
169         */
170        public XmlStreamReader(InputStream is, boolean lenient) throws IOException {
171            this(is, lenient, null);
172        }
173    
174        /**
175         * Creates a Reader for a raw InputStream.
176         * <p>
177         * It follows the same logic used for files.
178         * <p>
179         * If lenient detection is indicated and the detection above fails as per
180         * specifications it then attempts the following:
181         * <p>
182         * If the content type was 'text/html' it replaces it with 'text/xml' and
183         * tries the detection again.
184         * <p>
185         * Else if the XML prolog had a charset encoding that encoding is used.
186         * <p>
187         * Else if the content type had a charset encoding that encoding is used.
188         * <p>
189         * Else 'UTF-8' is used.
190         * <p>
191         * If lenient detection is indicated an XmlStreamReaderException is never
192         * thrown.
193         *
194         * @param is InputStream to create a Reader from.
195         * @param lenient indicates if the charset encoding detection should be
196         *        relaxed.
197         * @param defaultEncoding The default encoding
198         * @throws IOException thrown if there is a problem reading the stream.
199         * @throws XmlStreamReaderException thrown if the charset encoding could not
200         *         be determined according to the specs.
201         */
202        public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException {
203            this.defaultEncoding = defaultEncoding;
204            this.encoding = doRawStream(is, lenient);
205            this.reader = new InputStreamReader(is, encoding);
206        }
207    
208        /**
209         * Creates a Reader using the InputStream of a URL.
210         * <p>
211         * If the URL is not of type HTTP and there is not 'content-type' header in
212         * the fetched data it uses the same logic used for Files.
213         * <p>
214         * If the URL is a HTTP Url or there is a 'content-type' header in the
215         * fetched data it uses the same logic used for an InputStream with
216         * content-type.
217         * <p>
218         * It does a lenient charset encoding detection, check the constructor with
219         * the lenient parameter for details.
220         *
221         * @param url URL to create a Reader from.
222         * @throws IOException thrown if there is a problem reading the stream of
223         *         the URL.
224         */
225        public XmlStreamReader(URL url) throws IOException {
226            this(url.openConnection(), null);
227        }
228    
229        /**
230         * Creates a Reader using the InputStream of a URLConnection.
231         * <p>
232         * If the URLConnection is not of type HttpURLConnection and there is not
233         * 'content-type' header in the fetched data it uses the same logic used for
234         * files.
235         * <p>
236         * If the URLConnection is a HTTP Url or there is a 'content-type' header in
237         * the fetched data it uses the same logic used for an InputStream with
238         * content-type.
239         * <p>
240         * It does a lenient charset encoding detection, check the constructor with
241         * the lenient parameter for details.
242         *
243         * @param conn URLConnection to create a Reader from.
244         * @param defaultEncoding The default encoding
245         * @throws IOException thrown if there is a problem reading the stream of
246         *         the URLConnection.
247         */
248        public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException {
249            this.defaultEncoding = defaultEncoding;
250            boolean lenient = true;
251            String contentType = conn.getContentType();
252            InputStream is = conn.getInputStream();
253            if (conn instanceof HttpURLConnection || contentType != null) {
254                this.encoding = doHttpStream(is, contentType, lenient);
255            } else {
256                this.encoding = doRawStream(is, lenient);
257            }
258            this.reader = new InputStreamReader(is, encoding);
259        }
260    
261        /**
262         * Creates a Reader using an InputStream an the associated content-type
263         * header.
264         * <p>
265         * First it checks if the stream has BOM. If there is not BOM checks the
266         * content-type encoding. If there is not content-type encoding checks the
267         * XML prolog encoding. If there is not XML prolog encoding uses the default
268         * encoding mandated by the content-type MIME type.
269         * <p>
270         * It does a lenient charset encoding detection, check the constructor with
271         * the lenient parameter for details.
272         *
273         * @param is InputStream to create the reader from.
274         * @param httpContentType content-type header to use for the resolution of
275         *        the charset encoding.
276         * @throws IOException thrown if there is a problem reading the file.
277         */
278        public XmlStreamReader(InputStream is, String httpContentType)
279                throws IOException {
280            this(is, httpContentType, true);
281        }
282    
283        /**
284         * Creates a Reader using an InputStream an the associated content-type
285         * header. This constructor is lenient regarding the encoding detection.
286         * <p>
287         * First it checks if the stream has BOM. If there is not BOM checks the
288         * content-type encoding. If there is not content-type encoding checks the
289         * XML prolog encoding. If there is not XML prolog encoding uses the default
290         * encoding mandated by the content-type MIME type.
291         * <p>
292         * If lenient detection is indicated and the detection above fails as per
293         * specifications it then attempts the following:
294         * <p>
295         * If the content type was 'text/html' it replaces it with 'text/xml' and
296         * tries the detection again.
297         * <p>
298         * Else if the XML prolog had a charset encoding that encoding is used.
299         * <p>
300         * Else if the content type had a charset encoding that encoding is used.
301         * <p>
302         * Else 'UTF-8' is used.
303         * <p>
304         * If lenient detection is indicated an XmlStreamReaderException is never
305         * thrown.
306         *
307         * @param is InputStream to create the reader from.
308         * @param httpContentType content-type header to use for the resolution of
309         *        the charset encoding.
310         * @param lenient indicates if the charset encoding detection should be
311         *        relaxed.
312         * @param defaultEncoding The default encoding
313         * @throws IOException thrown if there is a problem reading the file.
314         * @throws XmlStreamReaderException thrown if the charset encoding could not
315         *         be determined according to the specs.
316         */
317        public XmlStreamReader(InputStream is, String httpContentType,
318                boolean lenient, String defaultEncoding) throws IOException {
319            this.defaultEncoding = defaultEncoding;
320            this.encoding = doHttpStream(is, httpContentType, lenient);
321            this.reader = new InputStreamReader(is, encoding);
322        }
323    
324        /**
325         * Creates a Reader using an InputStream an the associated content-type
326         * header. This constructor is lenient regarding the encoding detection.
327         * <p>
328         * First it checks if the stream has BOM. If there is not BOM checks the
329         * content-type encoding. If there is not content-type encoding checks the
330         * XML prolog encoding. If there is not XML prolog encoding uses the default
331         * encoding mandated by the content-type MIME type.
332         * <p>
333         * If lenient detection is indicated and the detection above fails as per
334         * specifications it then attempts the following:
335         * <p>
336         * If the content type was 'text/html' it replaces it with 'text/xml' and
337         * tries the detection again.
338         * <p>
339         * Else if the XML prolog had a charset encoding that encoding is used.
340         * <p>
341         * Else if the content type had a charset encoding that encoding is used.
342         * <p>
343         * Else 'UTF-8' is used.
344         * <p>
345         * If lenient detection is indicated an XmlStreamReaderException is never
346         * thrown.
347         *
348         * @param is InputStream to create the reader from.
349         * @param httpContentType content-type header to use for the resolution of
350         *        the charset encoding.
351         * @param lenient indicates if the charset encoding detection should be
352         *        relaxed.
353         * @throws IOException thrown if there is a problem reading the file.
354         * @throws XmlStreamReaderException thrown if the charset encoding could not
355         *         be determined according to the specs.
356         */
357        public XmlStreamReader(InputStream is, String httpContentType,
358                boolean lenient) throws IOException {
359            this(is, httpContentType, lenient, null);
360        }
361    
362        /**
363         * Returns the charset encoding of the XmlStreamReader.
364         *
365         * @return charset encoding.
366         */
367        public String getEncoding() {
368            return encoding;
369        }
370    
371        /**
372         * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
373         * @param buf the buffer to read the characters into
374         * @param offset The start offset
375         * @param len The number of bytes to read
376         * @return the number of characters read or -1 if the end of stream
377         * @throws IOException if an I/O error occurs
378         */
379        @Override
380        public int read(char[] buf, int offset, int len) throws IOException {
381            return reader.read(buf, offset, len);
382        }
383    
384        /**
385         * Closes the XmlStreamReader stream.
386         *
387         * @throws IOException thrown if there was a problem closing the stream.
388         */
389        @Override
390        public void close() throws IOException {
391            reader.close();
392        }
393    
394        /**
395         * Process the raw stream.
396         *
397         * @param is InputStream to create the reader from.
398         * @param lenient indicates if the charset encoding detection should be
399         *        relaxed.
400         * @return the encoding to be used
401         * @throws IOException thrown if there is a problem reading the stream.
402         */
403        private String doRawStream(InputStream is, boolean lenient)
404                throws IOException {
405            BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
406            BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
407            String bomEnc      = bom.getBOMCharsetName();
408            String xmlGuessEnc = pis.getBOMCharsetName();
409            String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
410            try {
411                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
412            } catch (XmlStreamReaderException ex) {
413                if (lenient) {
414                    return doLenientDetection(null, is, ex);
415                } else {
416                    throw ex;
417                }
418            }
419        }
420    
421        /**
422         * Process a HTTP stream.
423         *
424         * @param is InputStream to create the reader from.
425         * @param httpContentType The HTTP content type
426         * @param lenient indicates if the charset encoding detection should be
427         *        relaxed.
428         * @return the encoding to be used
429         * @throws IOException thrown if there is a problem reading the stream.
430         */
431        private String doHttpStream(InputStream is, String httpContentType,
432                boolean lenient) throws IOException {
433            BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
434            BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
435            String bomEnc      = bom.getBOMCharsetName();
436            String xmlGuessEnc = pis.getBOMCharsetName();
437            String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
438            try {
439                return calculateHttpEncoding(httpContentType, bomEnc,
440                        xmlGuessEnc, xmlEnc, lenient);
441            } catch (XmlStreamReaderException ex) {
442                if (lenient) {
443                    return doLenientDetection(httpContentType, is, ex);
444                } else {
445                    throw ex;
446                }
447            }
448        }
449    
450        /**
451         * Do lenient detection.
452         *
453         * @param httpContentType content-type header to use for the resolution of
454         *        the charset encoding.
455         * @param is the unconsumed InputStream
456         * @param ex The thrown exception
457         * @return the encoding
458         * @throws IOException thrown if there is a problem reading the stream.
459         */
460        private String doLenientDetection(String httpContentType, InputStream is,
461                XmlStreamReaderException ex) throws IOException {
462            if (httpContentType != null && httpContentType.startsWith("text/html")) {
463                httpContentType = httpContentType.substring("text/html".length());
464                httpContentType = "text/xml" + httpContentType;
465                try {
466                    return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
467                            ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
468                } catch (XmlStreamReaderException ex2) {
469                    ex = ex2;
470                }
471            }
472            String encoding = ex.getXmlEncoding();
473            if (encoding == null) {
474                encoding = ex.getContentTypeEncoding();
475            }
476            if (encoding == null) {
477                encoding = (defaultEncoding == null) ? UTF_8 : defaultEncoding;
478            }
479            return encoding;
480        }
481    
482        /**
483         * Calculate the raw encoding.
484         *
485         * @param bomEnc BOM encoding
486         * @param xmlGuessEnc XML Guess encoding
487         * @param xmlEnc XML encoding
488         * @return the raw encoding
489         * @throws IOException thrown if there is a problem reading the stream.
490         */
491        String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
492                String xmlEnc) throws IOException {
493    
494            // BOM is Null
495            if (bomEnc == null) {
496                if (xmlGuessEnc == null || xmlEnc == null) {
497                    return (defaultEncoding == null ? UTF_8 : defaultEncoding);
498                }
499                if (xmlEnc.equals(UTF_16) &&
500                   (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
501                    return xmlGuessEnc;
502                }
503                return xmlEnc;
504            }
505    
506            // BOM is UTF-8
507            if (bomEnc.equals(UTF_8)) {
508                if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
509                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
510                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
511                }
512                if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
513                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
514                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
515                }
516                return bomEnc;
517            }
518    
519            // BOM is UTF-16BE or UTF-16LE
520            if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
521                if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
522                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
523                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
524                }
525                if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
526                    String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
527                    throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
528                }
529                return bomEnc;
530            }
531    
532            // BOM is something else
533            String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
534            throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
535        }
536    
537    
538        /**
539         * Calculate the HTTP encoding.
540         *
541         * @param httpContentType The HTTP content type
542         * @param bomEnc BOM encoding
543         * @param xmlGuessEnc XML Guess encoding
544         * @param xmlEnc XML encoding
545         * @param lenient indicates if the charset encoding detection should be
546         *        relaxed.
547         * @return the HTTP encoding
548         * @throws IOException thrown if there is a problem reading the stream.
549         */
550        String calculateHttpEncoding(String httpContentType,
551                String bomEnc, String xmlGuessEnc, String xmlEnc,
552                boolean lenient) throws IOException {
553    
554            // Lenient and has XML encoding
555            if (lenient && xmlEnc != null) {
556                return xmlEnc;
557            }
558    
559            // Determine mime/encoding content types from HTTP Content Type
560            String cTMime = getContentTypeMime(httpContentType);
561            String cTEnc  = getContentTypeEncoding(httpContentType);
562            boolean appXml  = isAppXml(cTMime);
563            boolean textXml = isTextXml(cTMime);
564    
565            // Mime type NOT "application/xml" or "text/xml"
566            if (!appXml && !textXml) {
567                String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
568                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
569            }
570    
571            // No content type encoding
572            if (cTEnc == null) {
573                if (appXml) {
574                    return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
575                } else {
576                    return (defaultEncoding == null) ? US_ASCII : defaultEncoding;
577                }
578            }
579    
580            // UTF-16BE or UTF-16LE content type encoding
581            if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
582                if (bomEnc != null) {
583                    String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
584                    throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
585                }
586                return cTEnc;
587            }
588    
589            // UTF-16 content type encoding
590            if (cTEnc.equals(UTF_16)) {
591                if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
592                    return bomEnc;
593                }
594                String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
595                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
596            }
597    
598            return cTEnc;
599        }
600    
601        /**
602         * Returns MIME type or NULL if httpContentType is NULL.
603         *
604         * @param httpContentType the HTTP content type
605         * @return The mime content type
606         */
607        static String getContentTypeMime(String httpContentType) {
608            String mime = null;
609            if (httpContentType != null) {
610                int i = httpContentType.indexOf(";");
611                if (i >= 0) {
612                    mime = httpContentType.substring(0, i);
613                } else {
614                    mime = httpContentType;
615                }
616                mime = mime.trim();
617            }
618            return mime;
619        }
620    
621        private static final Pattern CHARSET_PATTERN = Pattern
622                .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
623    
624        /**
625         * Returns charset parameter value, NULL if not present, NULL if
626         * httpContentType is NULL.
627         *
628         * @param httpContentType the HTTP content type
629         * @return The content type encoding
630         */
631        static String getContentTypeEncoding(String httpContentType) {
632            String encoding = null;
633            if (httpContentType != null) {
634                int i = httpContentType.indexOf(";");
635                if (i > -1) {
636                    String postMime = httpContentType.substring(i + 1);
637                    Matcher m = CHARSET_PATTERN.matcher(postMime);
638                    encoding = (m.find()) ? m.group(1) : null;
639                    encoding = (encoding != null) ? encoding.toUpperCase() : null;
640                }
641            }
642            return encoding;
643        }
644    
645        public static final Pattern ENCODING_PATTERN = Pattern.compile(
646                "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
647                Pattern.MULTILINE);
648    
649        /**
650         * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
651         *
652         * @param is InputStream to create the reader from.
653         * @param guessedEnc guessed encoding
654         * @return the encoding declared in the <?xml encoding=...?>
655         * @throws IOException thrown if there is a problem reading the stream.
656         */
657        private static String getXmlProlog(InputStream is, String guessedEnc)
658                throws IOException {
659            String encoding = null;
660            if (guessedEnc != null) {
661                byte[] bytes = new byte[BUFFER_SIZE];
662                is.mark(BUFFER_SIZE);
663                int offset = 0;
664                int max = BUFFER_SIZE;
665                int c = is.read(bytes, offset, max);
666                int firstGT = -1;
667                String xmlProlog = null;
668                while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
669                    offset += c;
670                    max -= c;
671                    c = is.read(bytes, offset, max);
672                    xmlProlog = new String(bytes, 0, offset, guessedEnc);
673                    firstGT = xmlProlog.indexOf('>');
674                }
675                if (firstGT == -1) {
676                    if (c == -1) {
677                        throw new IOException("Unexpected end of XML stream");
678                    } else {
679                        throw new IOException(
680                                "XML prolog or ROOT element not found on first "
681                                        + offset + " bytes");
682                    }
683                }
684                int bytesRead = offset;
685                if (bytesRead > 0) {
686                    is.reset();
687                    BufferedReader bReader = new BufferedReader(new StringReader(
688                            xmlProlog.substring(0, firstGT + 1)));
689                    StringBuffer prolog = new StringBuffer();
690                    String line = bReader.readLine();
691                    while (line != null) {
692                        prolog.append(line);
693                        line = bReader.readLine();
694                    }
695                    Matcher m = ENCODING_PATTERN.matcher(prolog);
696                    if (m.find()) {
697                        encoding = m.group(1).toUpperCase();
698                        encoding = encoding.substring(1, encoding.length() - 1);
699                    }
700                }
701            }
702            return encoding;
703        }
704    
705        /**
706         * Indicates if the MIME type belongs to the APPLICATION XML family.
707         * 
708         * @param mime The mime type
709         * @return true if the mime type belongs to the APPLICATION XML family,
710         * otherwise false
711         */
712        static boolean isAppXml(String mime) {
713            return mime != null &&
714                   (mime.equals("application/xml") || 
715                    mime.equals("application/xml-dtd") ||
716                    mime.equals("application/xml-external-parsed-entity") ||
717                   (mime.startsWith("application/") && mime.endsWith("+xml")));
718        }
719    
720        /**
721         * Indicates if the MIME type belongs to the TEXT XML family.
722         * 
723         * @param mime The mime type
724         * @return true if the mime type belongs to the TEXT XML family,
725         * otherwise false
726         */
727        static boolean isTextXml(String mime) {
728            return mime != null &&
729                  (mime.equals("text/xml") ||
730                   mime.equals("text/xml-external-parsed-entity") ||
731                  (mime.startsWith("text/") && mime.endsWith("+xml")));
732        }
733    
734        private static final String RAW_EX_1 =
735            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
736    
737        private static final String RAW_EX_2 =
738            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
739    
740        private static final String HTTP_EX_1 =
741            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
742    
743        private static final String HTTP_EX_2 =
744            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
745    
746        private static final String HTTP_EX_3 =
747            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
748    
749    }