001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.io.input;
018
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.ByteOrderMark;
036
037 /**
038 * Character stream that handles all the necessary Voodo to figure out the
039 * charset encoding of the XML document within the stream.
040 * <p>
041 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
042 * This one IS a character stream.
043 * <p>
044 * All this has to be done without consuming characters from the stream, if not
045 * the XML parser will not recognized the document as a valid XML. This is not
046 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
047 * right now, XmlStreamReader handles it and things work in all parsers).
048 * <p>
049 * The XmlStreamReader class handles the charset encoding of XML documents in
050 * Files, raw streams and HTTP streams by offering a wide set of constructors.
051 * <p>
052 * By default the charset encoding detection is lenient, the constructor with
053 * the lenient flag can be used for an script (following HTTP MIME and XML
054 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
055 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
056 * Determining the character encoding of a feed</a>.
057 * <p>
058 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
059 * Apache License 2.0.
060 *
061 * @author Alejandro Abdelnur
062 * @version $Id: XmlStreamReader.java 1021884 2010-10-12 18:49:16Z ggregory $
063 * @see org.apache.commons.io.output.XmlStreamWriter
064 * @since Commons IO 2.0
065 */
066 public class XmlStreamReader extends Reader {
067 private static final int BUFFER_SIZE = 4096;
068
069 private static final String UTF_8 = "UTF-8";
070
071 private static final String US_ASCII = "US-ASCII";
072
073 private static final String UTF_16BE = "UTF-16BE";
074
075 private static final String UTF_16LE = "UTF-16LE";
076
077 private static final String UTF_16 = "UTF-16";
078
079 private static final String EBCDIC = "CP1047";
080
081 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
082 ByteOrderMark.UTF_8,
083 ByteOrderMark.UTF_16BE,
084 ByteOrderMark.UTF_16LE
085 };
086 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
087 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
088 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
089 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
090 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94)
091 };
092
093
094 private final Reader reader;
095
096 private final String encoding;
097
098 private final String defaultEncoding;
099
100 /**
101 * Returns the default encoding to use if none is set in HTTP content-type,
102 * XML prolog and the rules based on content-type are not adequate.
103 * <p>
104 * If it is NULL the content-type based rules are used.
105 *
106 * @return the default encoding to use.
107 */
108 public String getDefaultEncoding() {
109 return defaultEncoding;
110 }
111
112 /**
113 * Creates a Reader for a File.
114 * <p>
115 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
116 * if this is also missing defaults to UTF-8.
117 * <p>
118 * It does a lenient charset encoding detection, check the constructor with
119 * the lenient parameter for details.
120 *
121 * @param file File to create a Reader from.
122 * @throws IOException thrown if there is a problem reading the file.
123 */
124 public XmlStreamReader(File file) throws IOException {
125 this(new FileInputStream(file));
126 }
127
128 /**
129 * Creates a Reader for a raw InputStream.
130 * <p>
131 * It follows the same logic used for files.
132 * <p>
133 * It does a lenient charset encoding detection, check the constructor with
134 * the lenient parameter for details.
135 *
136 * @param is InputStream to create a Reader from.
137 * @throws IOException thrown if there is a problem reading the stream.
138 */
139 public XmlStreamReader(InputStream is) throws IOException {
140 this(is, true);
141 }
142
143 /**
144 * Creates a Reader for a raw InputStream.
145 * <p>
146 * It follows the same logic used for files.
147 * <p>
148 * If lenient detection is indicated and the detection above fails as per
149 * specifications it then attempts the following:
150 * <p>
151 * If the content type was 'text/html' it replaces it with 'text/xml' and
152 * tries the detection again.
153 * <p>
154 * Else if the XML prolog had a charset encoding that encoding is used.
155 * <p>
156 * Else if the content type had a charset encoding that encoding is used.
157 * <p>
158 * Else 'UTF-8' is used.
159 * <p>
160 * If lenient detection is indicated an XmlStreamReaderException is never
161 * thrown.
162 *
163 * @param is InputStream to create a Reader from.
164 * @param lenient indicates if the charset encoding detection should be
165 * relaxed.
166 * @throws IOException thrown if there is a problem reading the stream.
167 * @throws XmlStreamReaderException thrown if the charset encoding could not
168 * be determined according to the specs.
169 */
170 public XmlStreamReader(InputStream is, boolean lenient) throws IOException {
171 this(is, lenient, null);
172 }
173
174 /**
175 * Creates a Reader for a raw InputStream.
176 * <p>
177 * It follows the same logic used for files.
178 * <p>
179 * If lenient detection is indicated and the detection above fails as per
180 * specifications it then attempts the following:
181 * <p>
182 * If the content type was 'text/html' it replaces it with 'text/xml' and
183 * tries the detection again.
184 * <p>
185 * Else if the XML prolog had a charset encoding that encoding is used.
186 * <p>
187 * Else if the content type had a charset encoding that encoding is used.
188 * <p>
189 * Else 'UTF-8' is used.
190 * <p>
191 * If lenient detection is indicated an XmlStreamReaderException is never
192 * thrown.
193 *
194 * @param is InputStream to create a Reader from.
195 * @param lenient indicates if the charset encoding detection should be
196 * relaxed.
197 * @param defaultEncoding The default encoding
198 * @throws IOException thrown if there is a problem reading the stream.
199 * @throws XmlStreamReaderException thrown if the charset encoding could not
200 * be determined according to the specs.
201 */
202 public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException {
203 this.defaultEncoding = defaultEncoding;
204 this.encoding = doRawStream(is, lenient);
205 this.reader = new InputStreamReader(is, encoding);
206 }
207
208 /**
209 * Creates a Reader using the InputStream of a URL.
210 * <p>
211 * If the URL is not of type HTTP and there is not 'content-type' header in
212 * the fetched data it uses the same logic used for Files.
213 * <p>
214 * If the URL is a HTTP Url or there is a 'content-type' header in the
215 * fetched data it uses the same logic used for an InputStream with
216 * content-type.
217 * <p>
218 * It does a lenient charset encoding detection, check the constructor with
219 * the lenient parameter for details.
220 *
221 * @param url URL to create a Reader from.
222 * @throws IOException thrown if there is a problem reading the stream of
223 * the URL.
224 */
225 public XmlStreamReader(URL url) throws IOException {
226 this(url.openConnection(), null);
227 }
228
229 /**
230 * Creates a Reader using the InputStream of a URLConnection.
231 * <p>
232 * If the URLConnection is not of type HttpURLConnection and there is not
233 * 'content-type' header in the fetched data it uses the same logic used for
234 * files.
235 * <p>
236 * If the URLConnection is a HTTP Url or there is a 'content-type' header in
237 * the fetched data it uses the same logic used for an InputStream with
238 * content-type.
239 * <p>
240 * It does a lenient charset encoding detection, check the constructor with
241 * the lenient parameter for details.
242 *
243 * @param conn URLConnection to create a Reader from.
244 * @param defaultEncoding The default encoding
245 * @throws IOException thrown if there is a problem reading the stream of
246 * the URLConnection.
247 */
248 public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException {
249 this.defaultEncoding = defaultEncoding;
250 boolean lenient = true;
251 String contentType = conn.getContentType();
252 InputStream is = conn.getInputStream();
253 if (conn instanceof HttpURLConnection || contentType != null) {
254 this.encoding = doHttpStream(is, contentType, lenient);
255 } else {
256 this.encoding = doRawStream(is, lenient);
257 }
258 this.reader = new InputStreamReader(is, encoding);
259 }
260
261 /**
262 * Creates a Reader using an InputStream an the associated content-type
263 * header.
264 * <p>
265 * First it checks if the stream has BOM. If there is not BOM checks the
266 * content-type encoding. If there is not content-type encoding checks the
267 * XML prolog encoding. If there is not XML prolog encoding uses the default
268 * encoding mandated by the content-type MIME type.
269 * <p>
270 * It does a lenient charset encoding detection, check the constructor with
271 * the lenient parameter for details.
272 *
273 * @param is InputStream to create the reader from.
274 * @param httpContentType content-type header to use for the resolution of
275 * the charset encoding.
276 * @throws IOException thrown if there is a problem reading the file.
277 */
278 public XmlStreamReader(InputStream is, String httpContentType)
279 throws IOException {
280 this(is, httpContentType, true);
281 }
282
283 /**
284 * Creates a Reader using an InputStream an the associated content-type
285 * header. This constructor is lenient regarding the encoding detection.
286 * <p>
287 * First it checks if the stream has BOM. If there is not BOM checks the
288 * content-type encoding. If there is not content-type encoding checks the
289 * XML prolog encoding. If there is not XML prolog encoding uses the default
290 * encoding mandated by the content-type MIME type.
291 * <p>
292 * If lenient detection is indicated and the detection above fails as per
293 * specifications it then attempts the following:
294 * <p>
295 * If the content type was 'text/html' it replaces it with 'text/xml' and
296 * tries the detection again.
297 * <p>
298 * Else if the XML prolog had a charset encoding that encoding is used.
299 * <p>
300 * Else if the content type had a charset encoding that encoding is used.
301 * <p>
302 * Else 'UTF-8' is used.
303 * <p>
304 * If lenient detection is indicated an XmlStreamReaderException is never
305 * thrown.
306 *
307 * @param is InputStream to create the reader from.
308 * @param httpContentType content-type header to use for the resolution of
309 * the charset encoding.
310 * @param lenient indicates if the charset encoding detection should be
311 * relaxed.
312 * @param defaultEncoding The default encoding
313 * @throws IOException thrown if there is a problem reading the file.
314 * @throws XmlStreamReaderException thrown if the charset encoding could not
315 * be determined according to the specs.
316 */
317 public XmlStreamReader(InputStream is, String httpContentType,
318 boolean lenient, String defaultEncoding) throws IOException {
319 this.defaultEncoding = defaultEncoding;
320 this.encoding = doHttpStream(is, httpContentType, lenient);
321 this.reader = new InputStreamReader(is, encoding);
322 }
323
324 /**
325 * Creates a Reader using an InputStream an the associated content-type
326 * header. This constructor is lenient regarding the encoding detection.
327 * <p>
328 * First it checks if the stream has BOM. If there is not BOM checks the
329 * content-type encoding. If there is not content-type encoding checks the
330 * XML prolog encoding. If there is not XML prolog encoding uses the default
331 * encoding mandated by the content-type MIME type.
332 * <p>
333 * If lenient detection is indicated and the detection above fails as per
334 * specifications it then attempts the following:
335 * <p>
336 * If the content type was 'text/html' it replaces it with 'text/xml' and
337 * tries the detection again.
338 * <p>
339 * Else if the XML prolog had a charset encoding that encoding is used.
340 * <p>
341 * Else if the content type had a charset encoding that encoding is used.
342 * <p>
343 * Else 'UTF-8' is used.
344 * <p>
345 * If lenient detection is indicated an XmlStreamReaderException is never
346 * thrown.
347 *
348 * @param is InputStream to create the reader from.
349 * @param httpContentType content-type header to use for the resolution of
350 * the charset encoding.
351 * @param lenient indicates if the charset encoding detection should be
352 * relaxed.
353 * @throws IOException thrown if there is a problem reading the file.
354 * @throws XmlStreamReaderException thrown if the charset encoding could not
355 * be determined according to the specs.
356 */
357 public XmlStreamReader(InputStream is, String httpContentType,
358 boolean lenient) throws IOException {
359 this(is, httpContentType, lenient, null);
360 }
361
362 /**
363 * Returns the charset encoding of the XmlStreamReader.
364 *
365 * @return charset encoding.
366 */
367 public String getEncoding() {
368 return encoding;
369 }
370
371 /**
372 * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
373 * @param buf the buffer to read the characters into
374 * @param offset The start offset
375 * @param len The number of bytes to read
376 * @return the number of characters read or -1 if the end of stream
377 * @throws IOException if an I/O error occurs
378 */
379 @Override
380 public int read(char[] buf, int offset, int len) throws IOException {
381 return reader.read(buf, offset, len);
382 }
383
384 /**
385 * Closes the XmlStreamReader stream.
386 *
387 * @throws IOException thrown if there was a problem closing the stream.
388 */
389 @Override
390 public void close() throws IOException {
391 reader.close();
392 }
393
394 /**
395 * Process the raw stream.
396 *
397 * @param is InputStream to create the reader from.
398 * @param lenient indicates if the charset encoding detection should be
399 * relaxed.
400 * @return the encoding to be used
401 * @throws IOException thrown if there is a problem reading the stream.
402 */
403 private String doRawStream(InputStream is, boolean lenient)
404 throws IOException {
405 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
406 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
407 String bomEnc = bom.getBOMCharsetName();
408 String xmlGuessEnc = pis.getBOMCharsetName();
409 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
410 try {
411 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
412 } catch (XmlStreamReaderException ex) {
413 if (lenient) {
414 return doLenientDetection(null, is, ex);
415 } else {
416 throw ex;
417 }
418 }
419 }
420
421 /**
422 * Process a HTTP stream.
423 *
424 * @param is InputStream to create the reader from.
425 * @param httpContentType The HTTP content type
426 * @param lenient indicates if the charset encoding detection should be
427 * relaxed.
428 * @return the encoding to be used
429 * @throws IOException thrown if there is a problem reading the stream.
430 */
431 private String doHttpStream(InputStream is, String httpContentType,
432 boolean lenient) throws IOException {
433 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
434 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
435 String bomEnc = bom.getBOMCharsetName();
436 String xmlGuessEnc = pis.getBOMCharsetName();
437 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
438 try {
439 return calculateHttpEncoding(httpContentType, bomEnc,
440 xmlGuessEnc, xmlEnc, lenient);
441 } catch (XmlStreamReaderException ex) {
442 if (lenient) {
443 return doLenientDetection(httpContentType, is, ex);
444 } else {
445 throw ex;
446 }
447 }
448 }
449
450 /**
451 * Do lenient detection.
452 *
453 * @param httpContentType content-type header to use for the resolution of
454 * the charset encoding.
455 * @param is the unconsumed InputStream
456 * @param ex The thrown exception
457 * @return the encoding
458 * @throws IOException thrown if there is a problem reading the stream.
459 */
460 private String doLenientDetection(String httpContentType, InputStream is,
461 XmlStreamReaderException ex) throws IOException {
462 if (httpContentType != null && httpContentType.startsWith("text/html")) {
463 httpContentType = httpContentType.substring("text/html".length());
464 httpContentType = "text/xml" + httpContentType;
465 try {
466 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
467 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
468 } catch (XmlStreamReaderException ex2) {
469 ex = ex2;
470 }
471 }
472 String encoding = ex.getXmlEncoding();
473 if (encoding == null) {
474 encoding = ex.getContentTypeEncoding();
475 }
476 if (encoding == null) {
477 encoding = (defaultEncoding == null) ? UTF_8 : defaultEncoding;
478 }
479 return encoding;
480 }
481
482 /**
483 * Calculate the raw encoding.
484 *
485 * @param bomEnc BOM encoding
486 * @param xmlGuessEnc XML Guess encoding
487 * @param xmlEnc XML encoding
488 * @return the raw encoding
489 * @throws IOException thrown if there is a problem reading the stream.
490 */
491 String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
492 String xmlEnc) throws IOException {
493
494 // BOM is Null
495 if (bomEnc == null) {
496 if (xmlGuessEnc == null || xmlEnc == null) {
497 return (defaultEncoding == null ? UTF_8 : defaultEncoding);
498 }
499 if (xmlEnc.equals(UTF_16) &&
500 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
501 return xmlGuessEnc;
502 }
503 return xmlEnc;
504 }
505
506 // BOM is UTF-8
507 if (bomEnc.equals(UTF_8)) {
508 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
509 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
510 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
511 }
512 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
513 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
514 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
515 }
516 return bomEnc;
517 }
518
519 // BOM is UTF-16BE or UTF-16LE
520 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
521 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
522 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
523 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
524 }
525 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
526 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
527 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
528 }
529 return bomEnc;
530 }
531
532 // BOM is something else
533 String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
534 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
535 }
536
537
538 /**
539 * Calculate the HTTP encoding.
540 *
541 * @param httpContentType The HTTP content type
542 * @param bomEnc BOM encoding
543 * @param xmlGuessEnc XML Guess encoding
544 * @param xmlEnc XML encoding
545 * @param lenient indicates if the charset encoding detection should be
546 * relaxed.
547 * @return the HTTP encoding
548 * @throws IOException thrown if there is a problem reading the stream.
549 */
550 String calculateHttpEncoding(String httpContentType,
551 String bomEnc, String xmlGuessEnc, String xmlEnc,
552 boolean lenient) throws IOException {
553
554 // Lenient and has XML encoding
555 if (lenient && xmlEnc != null) {
556 return xmlEnc;
557 }
558
559 // Determine mime/encoding content types from HTTP Content Type
560 String cTMime = getContentTypeMime(httpContentType);
561 String cTEnc = getContentTypeEncoding(httpContentType);
562 boolean appXml = isAppXml(cTMime);
563 boolean textXml = isTextXml(cTMime);
564
565 // Mime type NOT "application/xml" or "text/xml"
566 if (!appXml && !textXml) {
567 String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
568 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
569 }
570
571 // No content type encoding
572 if (cTEnc == null) {
573 if (appXml) {
574 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
575 } else {
576 return (defaultEncoding == null) ? US_ASCII : defaultEncoding;
577 }
578 }
579
580 // UTF-16BE or UTF-16LE content type encoding
581 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
582 if (bomEnc != null) {
583 String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
584 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
585 }
586 return cTEnc;
587 }
588
589 // UTF-16 content type encoding
590 if (cTEnc.equals(UTF_16)) {
591 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
592 return bomEnc;
593 }
594 String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
595 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
596 }
597
598 return cTEnc;
599 }
600
601 /**
602 * Returns MIME type or NULL if httpContentType is NULL.
603 *
604 * @param httpContentType the HTTP content type
605 * @return The mime content type
606 */
607 static String getContentTypeMime(String httpContentType) {
608 String mime = null;
609 if (httpContentType != null) {
610 int i = httpContentType.indexOf(";");
611 if (i >= 0) {
612 mime = httpContentType.substring(0, i);
613 } else {
614 mime = httpContentType;
615 }
616 mime = mime.trim();
617 }
618 return mime;
619 }
620
621 private static final Pattern CHARSET_PATTERN = Pattern
622 .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
623
624 /**
625 * Returns charset parameter value, NULL if not present, NULL if
626 * httpContentType is NULL.
627 *
628 * @param httpContentType the HTTP content type
629 * @return The content type encoding
630 */
631 static String getContentTypeEncoding(String httpContentType) {
632 String encoding = null;
633 if (httpContentType != null) {
634 int i = httpContentType.indexOf(";");
635 if (i > -1) {
636 String postMime = httpContentType.substring(i + 1);
637 Matcher m = CHARSET_PATTERN.matcher(postMime);
638 encoding = (m.find()) ? m.group(1) : null;
639 encoding = (encoding != null) ? encoding.toUpperCase() : null;
640 }
641 }
642 return encoding;
643 }
644
645 public static final Pattern ENCODING_PATTERN = Pattern.compile(
646 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
647 Pattern.MULTILINE);
648
649 /**
650 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
651 *
652 * @param is InputStream to create the reader from.
653 * @param guessedEnc guessed encoding
654 * @return the encoding declared in the <?xml encoding=...?>
655 * @throws IOException thrown if there is a problem reading the stream.
656 */
657 private static String getXmlProlog(InputStream is, String guessedEnc)
658 throws IOException {
659 String encoding = null;
660 if (guessedEnc != null) {
661 byte[] bytes = new byte[BUFFER_SIZE];
662 is.mark(BUFFER_SIZE);
663 int offset = 0;
664 int max = BUFFER_SIZE;
665 int c = is.read(bytes, offset, max);
666 int firstGT = -1;
667 String xmlProlog = null;
668 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
669 offset += c;
670 max -= c;
671 c = is.read(bytes, offset, max);
672 xmlProlog = new String(bytes, 0, offset, guessedEnc);
673 firstGT = xmlProlog.indexOf('>');
674 }
675 if (firstGT == -1) {
676 if (c == -1) {
677 throw new IOException("Unexpected end of XML stream");
678 } else {
679 throw new IOException(
680 "XML prolog or ROOT element not found on first "
681 + offset + " bytes");
682 }
683 }
684 int bytesRead = offset;
685 if (bytesRead > 0) {
686 is.reset();
687 BufferedReader bReader = new BufferedReader(new StringReader(
688 xmlProlog.substring(0, firstGT + 1)));
689 StringBuffer prolog = new StringBuffer();
690 String line = bReader.readLine();
691 while (line != null) {
692 prolog.append(line);
693 line = bReader.readLine();
694 }
695 Matcher m = ENCODING_PATTERN.matcher(prolog);
696 if (m.find()) {
697 encoding = m.group(1).toUpperCase();
698 encoding = encoding.substring(1, encoding.length() - 1);
699 }
700 }
701 }
702 return encoding;
703 }
704
705 /**
706 * Indicates if the MIME type belongs to the APPLICATION XML family.
707 *
708 * @param mime The mime type
709 * @return true if the mime type belongs to the APPLICATION XML family,
710 * otherwise false
711 */
712 static boolean isAppXml(String mime) {
713 return mime != null &&
714 (mime.equals("application/xml") ||
715 mime.equals("application/xml-dtd") ||
716 mime.equals("application/xml-external-parsed-entity") ||
717 (mime.startsWith("application/") && mime.endsWith("+xml")));
718 }
719
720 /**
721 * Indicates if the MIME type belongs to the TEXT XML family.
722 *
723 * @param mime The mime type
724 * @return true if the mime type belongs to the TEXT XML family,
725 * otherwise false
726 */
727 static boolean isTextXml(String mime) {
728 return mime != null &&
729 (mime.equals("text/xml") ||
730 mime.equals("text/xml-external-parsed-entity") ||
731 (mime.startsWith("text/") && mime.endsWith("+xml")));
732 }
733
734 private static final String RAW_EX_1 =
735 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
736
737 private static final String RAW_EX_2 =
738 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
739
740 private static final String HTTP_EX_1 =
741 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
742
743 private static final String HTTP_EX_2 =
744 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
745
746 private static final String HTTP_EX_3 =
747 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
748
749 }