001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.io.input;
018
019 import java.io.BufferedInputStream;
020 import java.io.BufferedReader;
021 import java.io.File;
022 import java.io.FileInputStream;
023 import java.io.IOException;
024 import java.io.InputStream;
025 import java.io.InputStreamReader;
026 import java.io.Reader;
027 import java.io.StringReader;
028 import java.net.HttpURLConnection;
029 import java.net.URL;
030 import java.net.URLConnection;
031 import java.text.MessageFormat;
032 import java.util.Locale;
033 import java.util.regex.Matcher;
034 import java.util.regex.Pattern;
035
036 import org.apache.commons.io.ByteOrderMark;
037
038 /**
 * Character stream that handles all the necessary voodoo to figure out the
 * charset encoding of the XML document within the stream.
041 * <p>
042 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
043 * This one IS a character stream.
044 * <p>
 * All this has to be done without consuming characters from the stream;
 * otherwise the XML parser will not recognize the document as valid XML. This
 * is not 100% true, but it's close enough (a UTF-8 BOM is not handled by all
 * parsers right now; XmlStreamReader handles it and things work in all parsers).
049 * <p>
050 * The XmlStreamReader class handles the charset encoding of XML documents in
051 * Files, raw streams and HTTP streams by offering a wide set of constructors.
052 * <p>
 * By default the charset encoding detection is lenient; the constructor with
 * the lenient flag can be used for a strict detection (following the HTTP MIME
 * and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
058 * <p>
059 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
060 * Apache License 2.0.
061 *
062 * @version $Id: XmlStreamReader.java 1346400 2012-06-05 14:48:01Z ggregory $
063 * @see org.apache.commons.io.output.XmlStreamWriter
064 * @since 2.0
065 */
066 public class XmlStreamReader extends Reader {
067 private static final int BUFFER_SIZE = 4096;
068
069 private static final String UTF_8 = "UTF-8";
070
071 private static final String US_ASCII = "US-ASCII";
072
073 private static final String UTF_16BE = "UTF-16BE";
074
075 private static final String UTF_16LE = "UTF-16LE";
076
077 private static final String UTF_32BE = "UTF-32BE";
078
079 private static final String UTF_32LE = "UTF-32LE";
080
081 private static final String UTF_16 = "UTF-16";
082
083 private static final String UTF_32 = "UTF-32";
084
085 private static final String EBCDIC = "CP1047";
086
087 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
088 ByteOrderMark.UTF_8,
089 ByteOrderMark.UTF_16BE,
090 ByteOrderMark.UTF_16LE,
091 ByteOrderMark.UTF_32BE,
092 ByteOrderMark.UTF_32LE
093 };
094
095 // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
096 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
097 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
098 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
099 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
100 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
101 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
102 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
103 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
104 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94)
105 };
106
107 private final Reader reader;
108
109 private final String encoding;
110
111 private final String defaultEncoding;
112
113 /**
114 * Returns the default encoding to use if none is set in HTTP content-type,
115 * XML prolog and the rules based on content-type are not adequate.
116 * <p>
117 * If it is NULL the content-type based rules are used.
118 *
119 * @return the default encoding to use.
120 */
121 public String getDefaultEncoding() {
122 return defaultEncoding;
123 }
124
/**
 * Creates a Reader for a File.
 * <p>
 * The file is opened as a raw byte stream and handed to the
 * {@link #XmlStreamReader(InputStream)} constructor, so the detection is the
 * lenient raw-stream one: BOM first, then the XML prolog charset, and UTF-8
 * when both are missing.
 *
 * @param file File to create a Reader from.
 * @throws IOException thrown if there is a problem reading the file.
 */
public XmlStreamReader(File file) throws IOException {
    this(new FileInputStream(file));
}
140
/**
 * Creates a Reader for a raw InputStream using lenient detection.
 * <p>
 * Equivalent to calling {@link #XmlStreamReader(InputStream, boolean)} with
 * {@code lenient} set to {@code true}; see that constructor for the details
 * of the lenient rules.
 *
 * @param is InputStream to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream.
 */
public XmlStreamReader(InputStream is) throws IOException {
    this(is, true);
}
155
/**
 * Creates a Reader for a raw InputStream without a default encoding.
 * <p>
 * Delegates to {@link #XmlStreamReader(InputStream, boolean, String)} passing
 * {@code null} as the default encoding.
 * <p>
 * When {@code lenient} is {@code true} and the specification-conformant
 * detection fails, the encoding is chosen as follows:
 * <ul>
 * <li>the charset declared in the XML prolog, if any;</li>
 * <li>else the charset of the content type, if any;</li>
 * <li>else 'UTF-8'.</li>
 * </ul>
 * With lenient detection an XmlStreamReaderException is never thrown.
 *
 * @param is InputStream to create a Reader from.
 * @param lenient indicates if the charset encoding detection should be
 *        relaxed.
 * @throws IOException thrown if there is a problem reading the stream.
 * @throws XmlStreamReaderException thrown if the charset encoding could not
 *         be determined according to the specs.
 */
public XmlStreamReader(InputStream is, boolean lenient) throws IOException {
    this(is, lenient, null);
}
186
187 /**
188 * Creates a Reader for a raw InputStream.
189 * <p>
190 * It follows the same logic used for files.
191 * <p>
192 * If lenient detection is indicated and the detection above fails as per
193 * specifications it then attempts the following:
194 * <p>
195 * If the content type was 'text/html' it replaces it with 'text/xml' and
196 * tries the detection again.
197 * <p>
198 * Else if the XML prolog had a charset encoding that encoding is used.
199 * <p>
200 * Else if the content type had a charset encoding that encoding is used.
201 * <p>
202 * Else 'UTF-8' is used.
203 * <p>
204 * If lenient detection is indicated an XmlStreamReaderException is never
205 * thrown.
206 *
207 * @param is InputStream to create a Reader from.
208 * @param lenient indicates if the charset encoding detection should be
209 * relaxed.
210 * @param defaultEncoding The default encoding
211 * @throws IOException thrown if there is a problem reading the stream.
212 * @throws XmlStreamReaderException thrown if the charset encoding could not
213 * be determined according to the specs.
214 */
215 public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException {
216 this.defaultEncoding = defaultEncoding;
217 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
218 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
219 this.encoding = doRawStream(bom, pis, lenient);
220 this.reader = new InputStreamReader(pis, encoding);
221 }
222
/**
 * Creates a Reader using the InputStream of a URL.
 * <p>
 * Opens a connection to the URL and delegates to
 * {@link #XmlStreamReader(URLConnection, String)} with a {@code null}
 * default encoding. Consequently:
 * <ul>
 * <li>if the connection is not an HTTP one and reports no 'content-type',
 * the same logic used for files applies;</li>
 * <li>otherwise the logic for an InputStream with content-type applies.</li>
 * </ul>
 * The charset encoding detection is lenient.
 *
 * @param url URL to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream of
 *         the URL.
 */
public XmlStreamReader(URL url) throws IOException {
    this(url.openConnection(), null);
}
243
244 /**
245 * Creates a Reader using the InputStream of a URLConnection.
246 * <p>
247 * If the URLConnection is not of type HttpURLConnection and there is not
248 * 'content-type' header in the fetched data it uses the same logic used for
249 * files.
250 * <p>
251 * If the URLConnection is a HTTP Url or there is a 'content-type' header in
252 * the fetched data it uses the same logic used for an InputStream with
253 * content-type.
254 * <p>
255 * It does a lenient charset encoding detection, check the constructor with
256 * the lenient parameter for details.
257 *
258 * @param conn URLConnection to create a Reader from.
259 * @param defaultEncoding The default encoding
260 * @throws IOException thrown if there is a problem reading the stream of
261 * the URLConnection.
262 */
263 public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException {
264 this.defaultEncoding = defaultEncoding;
265 boolean lenient = true;
266 String contentType = conn.getContentType();
267 InputStream is = conn.getInputStream();
268 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
269 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
270 if (conn instanceof HttpURLConnection || contentType != null) {
271 this.encoding = doHttpStream(bom, pis, contentType, lenient);
272 } else {
273 this.encoding = doRawStream(bom, pis, lenient);
274 }
275 this.reader = new InputStreamReader(pis, encoding);
276 }
277
/**
 * Creates a Reader using an InputStream and the associated content-type
 * header, with lenient detection.
 * <p>
 * Equivalent to calling
 * {@link #XmlStreamReader(InputStream, String, boolean)} with {@code lenient}
 * set to {@code true}.
 * <p>
 * First it checks if the stream has a BOM. If there is no BOM it checks the
 * content-type encoding. If there is no content-type encoding it checks the
 * XML prolog encoding. If there is no XML prolog encoding it uses the
 * default encoding mandated by the content-type MIME type.
 *
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of
 *        the charset encoding.
 * @throws IOException thrown if there is a problem reading the file.
 */
public XmlStreamReader(InputStream is, String httpContentType) throws IOException {
    this(is, httpContentType, true);
}
299
300 /**
301 * Creates a Reader using an InputStream an the associated content-type
302 * header. This constructor is lenient regarding the encoding detection.
303 * <p>
304 * First it checks if the stream has BOM. If there is not BOM checks the
305 * content-type encoding. If there is not content-type encoding checks the
306 * XML prolog encoding. If there is not XML prolog encoding uses the default
307 * encoding mandated by the content-type MIME type.
308 * <p>
309 * If lenient detection is indicated and the detection above fails as per
310 * specifications it then attempts the following:
311 * <p>
312 * If the content type was 'text/html' it replaces it with 'text/xml' and
313 * tries the detection again.
314 * <p>
315 * Else if the XML prolog had a charset encoding that encoding is used.
316 * <p>
317 * Else if the content type had a charset encoding that encoding is used.
318 * <p>
319 * Else 'UTF-8' is used.
320 * <p>
321 * If lenient detection is indicated an XmlStreamReaderException is never
322 * thrown.
323 *
324 * @param is InputStream to create the reader from.
325 * @param httpContentType content-type header to use for the resolution of
326 * the charset encoding.
327 * @param lenient indicates if the charset encoding detection should be
328 * relaxed.
329 * @param defaultEncoding The default encoding
330 * @throws IOException thrown if there is a problem reading the file.
331 * @throws XmlStreamReaderException thrown if the charset encoding could not
332 * be determined according to the specs.
333 */
334 public XmlStreamReader(InputStream is, String httpContentType,
335 boolean lenient, String defaultEncoding) throws IOException {
336 this.defaultEncoding = defaultEncoding;
337 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
338 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
339 this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
340 this.reader = new InputStreamReader(pis, encoding);
341 }
342
/**
 * Creates a Reader using an InputStream and the associated content-type
 * header, without a default encoding.
 * <p>
 * Delegates to
 * {@link #XmlStreamReader(InputStream, String, boolean, String)} passing
 * {@code null} as the default encoding.
 * <p>
 * When {@code lenient} is {@code true} and the specification-conformant
 * detection fails, the following is attempted:
 * <ul>
 * <li>if the content type was 'text/html' it is replaced with 'text/xml'
 * and the detection is tried again;</li>
 * <li>else if the XML prolog had a charset encoding, that encoding is
 * used;</li>
 * <li>else if the content type had a charset encoding, that encoding is
 * used;</li>
 * <li>else 'UTF-8' is used.</li>
 * </ul>
 * With lenient detection an XmlStreamReaderException is never thrown.
 *
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of
 *        the charset encoding.
 * @param lenient indicates if the charset encoding detection should be
 *        relaxed.
 * @throws IOException thrown if there is a problem reading the file.
 * @throws XmlStreamReaderException thrown if the charset encoding could not
 *         be determined according to the specs.
 */
public XmlStreamReader(InputStream is, String httpContentType, boolean lenient) throws IOException {
    this(is, httpContentType, lenient, null);
}
380
381 /**
382 * Returns the charset encoding of the XmlStreamReader.
383 *
384 * @return charset encoding.
385 */
386 public String getEncoding() {
387 return encoding;
388 }
389
390 /**
391 * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
392 * @param buf the buffer to read the characters into
393 * @param offset The start offset
394 * @param len The number of bytes to read
395 * @return the number of characters read or -1 if the end of stream
396 * @throws IOException if an I/O error occurs
397 */
398 @Override
399 public int read(char[] buf, int offset, int len) throws IOException {
400 return reader.read(buf, offset, len);
401 }
402
403 /**
404 * Closes the XmlStreamReader stream.
405 *
406 * @throws IOException thrown if there was a problem closing the stream.
407 */
408 @Override
409 public void close() throws IOException {
410 reader.close();
411 }
412
413 /**
414 * Process the raw stream.
415 *
416 * @param bom BOMInputStream to detect byte order marks
417 * @param pis BOMInputStream to guess XML encoding
418 * @param lenient indicates if the charset encoding detection should be
419 * relaxed.
420 * @return the encoding to be used
421 * @throws IOException thrown if there is a problem reading the stream.
422 */
423 private String doRawStream(BOMInputStream bom, BOMInputStream pis, boolean lenient)
424 throws IOException {
425 String bomEnc = bom.getBOMCharsetName();
426 String xmlGuessEnc = pis.getBOMCharsetName();
427 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
428 try {
429 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
430 } catch (XmlStreamReaderException ex) {
431 if (lenient) {
432 return doLenientDetection(null, ex);
433 } else {
434 throw ex;
435 }
436 }
437 }
438
439 /**
440 * Process a HTTP stream.
441 *
442 * @param bom BOMInputStream to detect byte order marks
443 * @param pis BOMInputStream to guess XML encoding
444 * @param httpContentType The HTTP content type
445 * @param lenient indicates if the charset encoding detection should be
446 * relaxed.
447 * @return the encoding to be used
448 * @throws IOException thrown if there is a problem reading the stream.
449 */
450 private String doHttpStream(BOMInputStream bom, BOMInputStream pis, String httpContentType,
451 boolean lenient) throws IOException {
452 String bomEnc = bom.getBOMCharsetName();
453 String xmlGuessEnc = pis.getBOMCharsetName();
454 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
455 try {
456 return calculateHttpEncoding(httpContentType, bomEnc,
457 xmlGuessEnc, xmlEnc, lenient);
458 } catch (XmlStreamReaderException ex) {
459 if (lenient) {
460 return doLenientDetection(httpContentType, ex);
461 } else {
462 throw ex;
463 }
464 }
465 }
466
467 /**
468 * Do lenient detection.
469 *
470 * @param httpContentType content-type header to use for the resolution of
471 * the charset encoding.
472 * @param ex The thrown exception
473 * @return the encoding
474 * @throws IOException thrown if there is a problem reading the stream.
475 */
476 private String doLenientDetection(String httpContentType,
477 XmlStreamReaderException ex) throws IOException {
478 if (httpContentType != null && httpContentType.startsWith("text/html")) {
479 httpContentType = httpContentType.substring("text/html".length());
480 httpContentType = "text/xml" + httpContentType;
481 try {
482 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
483 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
484 } catch (XmlStreamReaderException ex2) {
485 ex = ex2;
486 }
487 }
488 String encoding = ex.getXmlEncoding();
489 if (encoding == null) {
490 encoding = ex.getContentTypeEncoding();
491 }
492 if (encoding == null) {
493 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
494 }
495 return encoding;
496 }
497
498 /**
499 * Calculate the raw encoding.
500 *
501 * @param bomEnc BOM encoding
502 * @param xmlGuessEnc XML Guess encoding
503 * @param xmlEnc XML encoding
504 * @return the raw encoding
505 * @throws IOException thrown if there is a problem reading the stream.
506 */
507 String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
508 String xmlEnc) throws IOException {
509
510 // BOM is Null
511 if (bomEnc == null) {
512 if (xmlGuessEnc == null || xmlEnc == null) {
513 return defaultEncoding == null ? UTF_8 : defaultEncoding;
514 }
515 if (xmlEnc.equals(UTF_16) &&
516 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
517 return xmlGuessEnc;
518 }
519 return xmlEnc;
520 }
521
522 // BOM is UTF-8
523 if (bomEnc.equals(UTF_8)) {
524 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
525 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
526 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
527 }
528 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
529 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
530 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
531 }
532 return bomEnc;
533 }
534
535 // BOM is UTF-16BE or UTF-16LE
536 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
537 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
538 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
539 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
540 }
541 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
542 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
543 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
544 }
545 return bomEnc;
546 }
547
548 // BOM is UTF-32BE or UTF-32LE
549 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
550 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
551 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
552 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
553 }
554 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
555 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
556 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
557 }
558 return bomEnc;
559 }
560
561 // BOM is something else
562 String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc });
563 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
564 }
565
566
567 /**
568 * Calculate the HTTP encoding.
569 *
570 * @param httpContentType The HTTP content type
571 * @param bomEnc BOM encoding
572 * @param xmlGuessEnc XML Guess encoding
573 * @param xmlEnc XML encoding
574 * @param lenient indicates if the charset encoding detection should be
575 * relaxed.
576 * @return the HTTP encoding
577 * @throws IOException thrown if there is a problem reading the stream.
578 */
579 String calculateHttpEncoding(String httpContentType,
580 String bomEnc, String xmlGuessEnc, String xmlEnc,
581 boolean lenient) throws IOException {
582
583 // Lenient and has XML encoding
584 if (lenient && xmlEnc != null) {
585 return xmlEnc;
586 }
587
588 // Determine mime/encoding content types from HTTP Content Type
589 String cTMime = getContentTypeMime(httpContentType);
590 String cTEnc = getContentTypeEncoding(httpContentType);
591 boolean appXml = isAppXml(cTMime);
592 boolean textXml = isTextXml(cTMime);
593
594 // Mime type NOT "application/xml" or "text/xml"
595 if (!appXml && !textXml) {
596 String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
597 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
598 }
599
600 // No content type encoding
601 if (cTEnc == null) {
602 if (appXml) {
603 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
604 } else {
605 return defaultEncoding == null ? US_ASCII : defaultEncoding;
606 }
607 }
608
609 // UTF-16BE or UTF-16LE content type encoding
610 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
611 if (bomEnc != null) {
612 String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
613 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
614 }
615 return cTEnc;
616 }
617
618 // UTF-16 content type encoding
619 if (cTEnc.equals(UTF_16)) {
620 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
621 return bomEnc;
622 }
623 String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
624 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
625 }
626
627 // UTF-32BE or UTF-132E content type encoding
628 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
629 if (bomEnc != null) {
630 String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
631 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
632 }
633 return cTEnc;
634 }
635
636 // UTF-32 content type encoding
637 if (cTEnc.equals(UTF_32)) {
638 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
639 return bomEnc;
640 }
641 String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
642 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
643 }
644
645 return cTEnc;
646 }
647
648 /**
649 * Returns MIME type or NULL if httpContentType is NULL.
650 *
651 * @param httpContentType the HTTP content type
652 * @return The mime content type
653 */
654 static String getContentTypeMime(String httpContentType) {
655 String mime = null;
656 if (httpContentType != null) {
657 int i = httpContentType.indexOf(";");
658 if (i >= 0) {
659 mime = httpContentType.substring(0, i);
660 } else {
661 mime = httpContentType;
662 }
663 mime = mime.trim();
664 }
665 return mime;
666 }
667
668 private static final Pattern CHARSET_PATTERN = Pattern
669 .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
670
671 /**
672 * Returns charset parameter value, NULL if not present, NULL if
673 * httpContentType is NULL.
674 *
675 * @param httpContentType the HTTP content type
676 * @return The content type encoding (upcased)
677 */
678 static String getContentTypeEncoding(String httpContentType) {
679 String encoding = null;
680 if (httpContentType != null) {
681 int i = httpContentType.indexOf(";");
682 if (i > -1) {
683 String postMime = httpContentType.substring(i + 1);
684 Matcher m = CHARSET_PATTERN.matcher(postMime);
685 encoding = m.find() ? m.group(1) : null;
686 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
687 }
688 }
689 return encoding;
690 }
691
692 public static final Pattern ENCODING_PATTERN = Pattern.compile(
693 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
694 Pattern.MULTILINE);
695
696 /**
697 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
698 *
699 * @param is InputStream to create the reader from.
700 * @param guessedEnc guessed encoding
701 * @return the encoding declared in the <?xml encoding=...?>
702 * @throws IOException thrown if there is a problem reading the stream.
703 */
704 private static String getXmlProlog(InputStream is, String guessedEnc)
705 throws IOException {
706 String encoding = null;
707 if (guessedEnc != null) {
708 byte[] bytes = new byte[BUFFER_SIZE];
709 is.mark(BUFFER_SIZE);
710 int offset = 0;
711 int max = BUFFER_SIZE;
712 int c = is.read(bytes, offset, max);
713 int firstGT = -1;
714 String xmlProlog = null;
715 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
716 offset += c;
717 max -= c;
718 c = is.read(bytes, offset, max);
719 xmlProlog = new String(bytes, 0, offset, guessedEnc);
720 firstGT = xmlProlog.indexOf('>');
721 }
722 if (firstGT == -1) {
723 if (c == -1) {
724 throw new IOException("Unexpected end of XML stream");
725 } else {
726 throw new IOException(
727 "XML prolog or ROOT element not found on first "
728 + offset + " bytes");
729 }
730 }
731 int bytesRead = offset;
732 if (bytesRead > 0) {
733 is.reset();
734 BufferedReader bReader = new BufferedReader(new StringReader(
735 xmlProlog.substring(0, firstGT + 1)));
736 StringBuffer prolog = new StringBuffer();
737 String line = bReader.readLine();
738 while (line != null) {
739 prolog.append(line);
740 line = bReader.readLine();
741 }
742 Matcher m = ENCODING_PATTERN.matcher(prolog);
743 if (m.find()) {
744 encoding = m.group(1).toUpperCase();
745 encoding = encoding.substring(1, encoding.length() - 1);
746 }
747 }
748 }
749 return encoding;
750 }
751
752 /**
753 * Indicates if the MIME type belongs to the APPLICATION XML family.
754 *
755 * @param mime The mime type
756 * @return true if the mime type belongs to the APPLICATION XML family,
757 * otherwise false
758 */
759 static boolean isAppXml(String mime) {
760 return mime != null &&
761 (mime.equals("application/xml") ||
762 mime.equals("application/xml-dtd") ||
763 mime.equals("application/xml-external-parsed-entity") ||
764 mime.startsWith("application/") && mime.endsWith("+xml"));
765 }
766
767 /**
768 * Indicates if the MIME type belongs to the TEXT XML family.
769 *
770 * @param mime The mime type
771 * @return true if the mime type belongs to the TEXT XML family,
772 * otherwise false
773 */
774 static boolean isTextXml(String mime) {
775 return mime != null &&
776 (mime.equals("text/xml") ||
777 mime.equals("text/xml-external-parsed-entity") ||
778 mime.startsWith("text/") && mime.endsWith("+xml"));
779 }
780
781 private static final String RAW_EX_1 =
782 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
783
784 private static final String RAW_EX_2 =
785 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
786
787 private static final String HTTP_EX_1 =
788 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
789
790 private static final String HTTP_EX_2 =
791 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
792
793 private static final String HTTP_EX_3 =
794 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
795
796 }