001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.FileInputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.io.Reader; 027import java.io.StringReader; 028import java.net.HttpURLConnection; 029import java.net.URL; 030import java.net.URLConnection; 031import java.text.MessageFormat; 032import java.util.Locale; 033import java.util.regex.Matcher; 034import java.util.regex.Pattern; 035 036import org.apache.commons.io.ByteOrderMark; 037 038/** 039 * Character stream that handles all the necessary Voodo to figure out the 040 * charset encoding of the XML document within the stream. 041 * <p> 042 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. 043 * This one IS a character stream. 044 * <p> 045 * All this has to be done without consuming characters from the stream, if not 046 * the XML parser will not recognized the document as a valid XML. This is not 047 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers 048 * right now, XmlStreamReader handles it and things work in all parsers). 049 * <p> 050 * The XmlStreamReader class handles the charset encoding of XML documents in 051 * Files, raw streams and HTTP streams by offering a wide set of constructors. 052 * <p> 053 * By default the charset encoding detection is lenient, the constructor with 054 * the lenient flag can be used for an script (following HTTP MIME and XML 055 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a 056 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> 057 * Determining the character encoding of a feed</a>. 058 * <p> 059 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under 060 * Apache License 2.0. 061 * 062 * @version $Id: XmlStreamReader.java 1686747 2015-06-21 18:44:49Z krosenvold $ 063 * @see org.apache.commons.io.output.XmlStreamWriter 064 * @since 2.0 065 */ 066public class XmlStreamReader extends Reader { 067 private static final int BUFFER_SIZE = 4096; 068 069 private static final String UTF_8 = "UTF-8"; 070 071 private static final String US_ASCII = "US-ASCII"; 072 073 private static final String UTF_16BE = "UTF-16BE"; 074 075 private static final String UTF_16LE = "UTF-16LE"; 076 077 private static final String UTF_32BE = "UTF-32BE"; 078 079 private static final String UTF_32LE = "UTF-32LE"; 080 081 private static final String UTF_16 = "UTF-16"; 082 083 private static final String UTF_32 = "UTF-32"; 084 085 private static final String EBCDIC = "CP1047"; 086 087 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] { 088 ByteOrderMark.UTF_8, 089 ByteOrderMark.UTF_16BE, 090 ByteOrderMark.UTF_16LE, 091 ByteOrderMark.UTF_32BE, 092 ByteOrderMark.UTF_32LE 093 }; 094 095 // UTF_16LE and UTF_32LE have the same two starting BOM bytes. 096 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] { 097 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 098 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), 099 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 100 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 101 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D), 102 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 103 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00), 104 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) 105 }; 106 107 private final Reader reader; 108 109 private final String encoding; 110 111 private final String defaultEncoding; 112 113 /** 114 * Returns the default encoding to use if none is set in HTTP content-type, 115 * XML prolog and the rules based on content-type are not adequate. 116 * <p> 117 * If it is NULL the content-type based rules are used. 118 * 119 * @return the default encoding to use. 120 */ 121 public String getDefaultEncoding() { 122 return defaultEncoding; 123 } 124 125 /** 126 * Creates a Reader for a File. 127 * <p> 128 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, 129 * if this is also missing defaults to UTF-8. 130 * <p> 131 * It does a lenient charset encoding detection, check the constructor with 132 * the lenient parameter for details. 133 * 134 * @param file File to create a Reader from. 135 * @throws IOException thrown if there is a problem reading the file. 136 */ 137 public XmlStreamReader(final File file) throws IOException { 138 this(new FileInputStream(file)); 139 } 140 141 /** 142 * Creates a Reader for a raw InputStream. 143 * <p> 144 * It follows the same logic used for files. 145 * <p> 146 * It does a lenient charset encoding detection, check the constructor with 147 * the lenient parameter for details. 148 * 149 * @param is InputStream to create a Reader from. 150 * @throws IOException thrown if there is a problem reading the stream. 151 */ 152 public XmlStreamReader(final InputStream is) throws IOException { 153 this(is, true); 154 } 155 156 /** 157 * Creates a Reader for a raw InputStream. 158 * <p> 159 * It follows the same logic used for files. 160 * <p> 161 * If lenient detection is indicated and the detection above fails as per 162 * specifications it then attempts the following: 163 * <p> 164 * If the content type was 'text/html' it replaces it with 'text/xml' and 165 * tries the detection again. 166 * <p> 167 * Else if the XML prolog had a charset encoding that encoding is used. 168 * <p> 169 * Else if the content type had a charset encoding that encoding is used. 170 * <p> 171 * Else 'UTF-8' is used. 172 * <p> 173 * If lenient detection is indicated an XmlStreamReaderException is never 174 * thrown. 175 * 176 * @param is InputStream to create a Reader from. 177 * @param lenient indicates if the charset encoding detection should be 178 * relaxed. 179 * @throws IOException thrown if there is a problem reading the stream. 180 * @throws XmlStreamReaderException thrown if the charset encoding could not 181 * be determined according to the specs. 182 */ 183 public XmlStreamReader(final InputStream is, final boolean lenient) throws IOException { 184 this(is, lenient, null); 185 } 186 187 /** 188 * Creates a Reader for a raw InputStream. 189 * <p> 190 * It follows the same logic used for files. 191 * <p> 192 * If lenient detection is indicated and the detection above fails as per 193 * specifications it then attempts the following: 194 * <p> 195 * If the content type was 'text/html' it replaces it with 'text/xml' and 196 * tries the detection again. 197 * <p> 198 * Else if the XML prolog had a charset encoding that encoding is used. 199 * <p> 200 * Else if the content type had a charset encoding that encoding is used. 201 * <p> 202 * Else 'UTF-8' is used. 203 * <p> 204 * If lenient detection is indicated an XmlStreamReaderException is never 205 * thrown. 206 * 207 * @param is InputStream to create a Reader from. 208 * @param lenient indicates if the charset encoding detection should be 209 * relaxed. 210 * @param defaultEncoding The default encoding 211 * @throws IOException thrown if there is a problem reading the stream. 212 * @throws XmlStreamReaderException thrown if the charset encoding could not 213 * be determined according to the specs. 214 */ 215 public XmlStreamReader(final InputStream is, final boolean lenient, final String defaultEncoding) 216 throws IOException { 217 this.defaultEncoding = defaultEncoding; 218 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 219 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 220 this.encoding = doRawStream(bom, pis, lenient); 221 this.reader = new InputStreamReader(pis, encoding); 222 } 223 224 /** 225 * Creates a Reader using the InputStream of a URL. 226 * <p> 227 * If the URL is not of type HTTP and there is not 'content-type' header in 228 * the fetched data it uses the same logic used for Files. 229 * <p> 230 * If the URL is a HTTP Url or there is a 'content-type' header in the 231 * fetched data it uses the same logic used for an InputStream with 232 * content-type. 233 * <p> 234 * It does a lenient charset encoding detection, check the constructor with 235 * the lenient parameter for details. 236 * 237 * @param url URL to create a Reader from. 238 * @throws IOException thrown if there is a problem reading the stream of 239 * the URL. 240 */ 241 public XmlStreamReader(final URL url) throws IOException { 242 this(url.openConnection(), null); 243 } 244 245 /** 246 * Creates a Reader using the InputStream of a URLConnection. 247 * <p> 248 * If the URLConnection is not of type HttpURLConnection and there is not 249 * 'content-type' header in the fetched data it uses the same logic used for 250 * files. 251 * <p> 252 * If the URLConnection is a HTTP Url or there is a 'content-type' header in 253 * the fetched data it uses the same logic used for an InputStream with 254 * content-type. 255 * <p> 256 * It does a lenient charset encoding detection, check the constructor with 257 * the lenient parameter for details. 258 * 259 * @param conn URLConnection to create a Reader from. 260 * @param defaultEncoding The default encoding 261 * @throws IOException thrown if there is a problem reading the stream of 262 * the URLConnection. 263 */ 264 public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException { 265 this.defaultEncoding = defaultEncoding; 266 final boolean lenient = true; 267 final String contentType = conn.getContentType(); 268 final InputStream is = conn.getInputStream(); 269 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 270 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 271 if (conn instanceof HttpURLConnection || contentType != null) { 272 this.encoding = doHttpStream(bom, pis, contentType, lenient); 273 } else { 274 this.encoding = doRawStream(bom, pis, lenient); 275 } 276 this.reader = new InputStreamReader(pis, encoding); 277 } 278 279 /** 280 * Creates a Reader using an InputStream an the associated content-type 281 * header. 282 * <p> 283 * First it checks if the stream has BOM. If there is not BOM checks the 284 * content-type encoding. If there is not content-type encoding checks the 285 * XML prolog encoding. If there is not XML prolog encoding uses the default 286 * encoding mandated by the content-type MIME type. 287 * <p> 288 * It does a lenient charset encoding detection, check the constructor with 289 * the lenient parameter for details. 290 * 291 * @param is InputStream to create the reader from. 292 * @param httpContentType content-type header to use for the resolution of 293 * the charset encoding. 294 * @throws IOException thrown if there is a problem reading the file. 295 */ 296 public XmlStreamReader(final InputStream is, final String httpContentType) 297 throws IOException { 298 this(is, httpContentType, true); 299 } 300 301 /** 302 * Creates a Reader using an InputStream an the associated content-type 303 * header. This constructor is lenient regarding the encoding detection. 304 * <p> 305 * First it checks if the stream has BOM. If there is not BOM checks the 306 * content-type encoding. If there is not content-type encoding checks the 307 * XML prolog encoding. If there is not XML prolog encoding uses the default 308 * encoding mandated by the content-type MIME type. 309 * <p> 310 * If lenient detection is indicated and the detection above fails as per 311 * specifications it then attempts the following: 312 * <p> 313 * If the content type was 'text/html' it replaces it with 'text/xml' and 314 * tries the detection again. 315 * <p> 316 * Else if the XML prolog had a charset encoding that encoding is used. 317 * <p> 318 * Else if the content type had a charset encoding that encoding is used. 319 * <p> 320 * Else 'UTF-8' is used. 321 * <p> 322 * If lenient detection is indicated an XmlStreamReaderException is never 323 * thrown. 324 * 325 * @param is InputStream to create the reader from. 326 * @param httpContentType content-type header to use for the resolution of 327 * the charset encoding. 328 * @param lenient indicates if the charset encoding detection should be 329 * relaxed. 330 * @param defaultEncoding The default encoding 331 * @throws IOException thrown if there is a problem reading the file. 332 * @throws XmlStreamReaderException thrown if the charset encoding could not 333 * be determined according to the specs. 334 */ 335 public XmlStreamReader(final InputStream is, final String httpContentType, 336 final boolean lenient, final String defaultEncoding) throws IOException { 337 this.defaultEncoding = defaultEncoding; 338 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 339 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 340 this.encoding = doHttpStream(bom, pis, httpContentType, lenient); 341 this.reader = new InputStreamReader(pis, encoding); 342 } 343 344 /** 345 * Creates a Reader using an InputStream an the associated content-type 346 * header. This constructor is lenient regarding the encoding detection. 347 * <p> 348 * First it checks if the stream has BOM. If there is not BOM checks the 349 * content-type encoding. If there is not content-type encoding checks the 350 * XML prolog encoding. If there is not XML prolog encoding uses the default 351 * encoding mandated by the content-type MIME type. 352 * <p> 353 * If lenient detection is indicated and the detection above fails as per 354 * specifications it then attempts the following: 355 * <p> 356 * If the content type was 'text/html' it replaces it with 'text/xml' and 357 * tries the detection again. 358 * <p> 359 * Else if the XML prolog had a charset encoding that encoding is used. 360 * <p> 361 * Else if the content type had a charset encoding that encoding is used. 362 * <p> 363 * Else 'UTF-8' is used. 364 * <p> 365 * If lenient detection is indicated an XmlStreamReaderException is never 366 * thrown. 367 * 368 * @param is InputStream to create the reader from. 369 * @param httpContentType content-type header to use for the resolution of 370 * the charset encoding. 371 * @param lenient indicates if the charset encoding detection should be 372 * relaxed. 373 * @throws IOException thrown if there is a problem reading the file. 374 * @throws XmlStreamReaderException thrown if the charset encoding could not 375 * be determined according to the specs. 376 */ 377 public XmlStreamReader(final InputStream is, final String httpContentType, 378 final boolean lenient) throws IOException { 379 this(is, httpContentType, lenient, null); 380 } 381 382 /** 383 * Returns the charset encoding of the XmlStreamReader. 384 * 385 * @return charset encoding. 386 */ 387 public String getEncoding() { 388 return encoding; 389 } 390 391 /** 392 * Invokes the underlying reader's <code>read(char[], int, int)</code> method. 393 * @param buf the buffer to read the characters into 394 * @param offset The start offset 395 * @param len The number of bytes to read 396 * @return the number of characters read or -1 if the end of stream 397 * @throws IOException if an I/O error occurs 398 */ 399 @Override 400 public int read(final char[] buf, final int offset, final int len) throws IOException { 401 return reader.read(buf, offset, len); 402 } 403 404 /** 405 * Closes the XmlStreamReader stream. 406 * 407 * @throws IOException thrown if there was a problem closing the stream. 408 */ 409 @Override 410 public void close() throws IOException { 411 reader.close(); 412 } 413 414 /** 415 * Process the raw stream. 416 * 417 * @param bom BOMInputStream to detect byte order marks 418 * @param pis BOMInputStream to guess XML encoding 419 * @param lenient indicates if the charset encoding detection should be 420 * relaxed. 421 * @return the encoding to be used 422 * @throws IOException thrown if there is a problem reading the stream. 423 */ 424 private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) 425 throws IOException { 426 final String bomEnc = bom.getBOMCharsetName(); 427 final String xmlGuessEnc = pis.getBOMCharsetName(); 428 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 429 try { 430 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 431 } catch (final XmlStreamReaderException ex) { 432 if (lenient) { 433 return doLenientDetection(null, ex); 434 } else { 435 throw ex; 436 } 437 } 438 } 439 440 /** 441 * Process a HTTP stream. 442 * 443 * @param bom BOMInputStream to detect byte order marks 444 * @param pis BOMInputStream to guess XML encoding 445 * @param httpContentType The HTTP content type 446 * @param lenient indicates if the charset encoding detection should be 447 * relaxed. 448 * @return the encoding to be used 449 * @throws IOException thrown if there is a problem reading the stream. 450 */ 451 private String doHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType, 452 final boolean lenient) throws IOException { 453 final String bomEnc = bom.getBOMCharsetName(); 454 final String xmlGuessEnc = pis.getBOMCharsetName(); 455 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 456 try { 457 return calculateHttpEncoding(httpContentType, bomEnc, 458 xmlGuessEnc, xmlEnc, lenient); 459 } catch (final XmlStreamReaderException ex) { 460 if (lenient) { 461 return doLenientDetection(httpContentType, ex); 462 } else { 463 throw ex; 464 } 465 } 466 } 467 468 /** 469 * Do lenient detection. 470 * 471 * @param httpContentType content-type header to use for the resolution of 472 * the charset encoding. 473 * @param ex The thrown exception 474 * @return the encoding 475 * @throws IOException thrown if there is a problem reading the stream. 476 */ 477 private String doLenientDetection(String httpContentType, 478 XmlStreamReaderException ex) throws IOException { 479 if (httpContentType != null && httpContentType.startsWith("text/html")) { 480 httpContentType = httpContentType.substring("text/html".length()); 481 httpContentType = "text/xml" + httpContentType; 482 try { 483 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), 484 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 485 } catch (final XmlStreamReaderException ex2) { 486 ex = ex2; 487 } 488 } 489 String encoding = ex.getXmlEncoding(); 490 if (encoding == null) { 491 encoding = ex.getContentTypeEncoding(); 492 } 493 if (encoding == null) { 494 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 495 } 496 return encoding; 497 } 498 499 /** 500 * Calculate the raw encoding. 501 * 502 * @param bomEnc BOM encoding 503 * @param xmlGuessEnc XML Guess encoding 504 * @param xmlEnc XML encoding 505 * @return the raw encoding 506 * @throws IOException thrown if there is a problem reading the stream. 507 */ 508 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, 509 final String xmlEnc) throws IOException { 510 511 // BOM is Null 512 if (bomEnc == null) { 513 if (xmlGuessEnc == null || xmlEnc == null) { 514 return defaultEncoding == null ? UTF_8 : defaultEncoding; 515 } 516 if (xmlEnc.equals(UTF_16) && 517 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 518 return xmlGuessEnc; 519 } 520 return xmlEnc; 521 } 522 523 // BOM is UTF-8 524 if (bomEnc.equals(UTF_8)) { 525 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 526 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 527 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 528 } 529 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 530 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 531 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 532 } 533 return bomEnc; 534 } 535 536 // BOM is UTF-16BE or UTF-16LE 537 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 538 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 539 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 540 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 541 } 542 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 543 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 544 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 545 } 546 return bomEnc; 547 } 548 549 // BOM is UTF-32BE or UTF-32LE 550 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 551 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 552 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 553 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 554 } 555 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 556 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 557 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 558 } 559 return bomEnc; 560 } 561 562 // BOM is something else 563 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 564 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 565 } 566 567 568 /** 569 * Calculate the HTTP encoding. 570 * 571 * @param httpContentType The HTTP content type 572 * @param bomEnc BOM encoding 573 * @param xmlGuessEnc XML Guess encoding 574 * @param xmlEnc XML encoding 575 * @param lenient indicates if the charset encoding detection should be 576 * relaxed. 577 * @return the HTTP encoding 578 * @throws IOException thrown if there is a problem reading the stream. 579 */ 580 String calculateHttpEncoding(final String httpContentType, 581 final String bomEnc, final String xmlGuessEnc, final String xmlEnc, 582 final boolean lenient) throws IOException { 583 584 // Lenient and has XML encoding 585 if (lenient && xmlEnc != null) { 586 return xmlEnc; 587 } 588 589 // Determine mime/encoding content types from HTTP Content Type 590 final String cTMime = getContentTypeMime(httpContentType); 591 final String cTEnc = getContentTypeEncoding(httpContentType); 592 final boolean appXml = isAppXml(cTMime); 593 final boolean textXml = isTextXml(cTMime); 594 595 // Mime type NOT "application/xml" or "text/xml" 596 if (!appXml && !textXml) { 597 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 598 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 599 } 600 601 // No content type encoding 602 if (cTEnc == null) { 603 if (appXml) { 604 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 605 } else { 606 return defaultEncoding == null ? US_ASCII : defaultEncoding; 607 } 608 } 609 610 // UTF-16BE or UTF-16LE content type encoding 611 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 612 if (bomEnc != null) { 613 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 614 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 615 } 616 return cTEnc; 617 } 618 619 // UTF-16 content type encoding 620 if (cTEnc.equals(UTF_16)) { 621 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 622 return bomEnc; 623 } 624 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 625 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 626 } 627 628 // UTF-32BE or UTF-132E content type encoding 629 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 630 if (bomEnc != null) { 631 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 632 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 633 } 634 return cTEnc; 635 } 636 637 // UTF-32 content type encoding 638 if (cTEnc.equals(UTF_32)) { 639 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 640 return bomEnc; 641 } 642 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 643 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 644 } 645 646 return cTEnc; 647 } 648 649 /** 650 * Returns MIME type or NULL if httpContentType is NULL. 651 * 652 * @param httpContentType the HTTP content type 653 * @return The mime content type 654 */ 655 static String getContentTypeMime(final String httpContentType) { 656 String mime = null; 657 if (httpContentType != null) { 658 final int i = httpContentType.indexOf(";"); 659 if (i >= 0) { 660 mime = httpContentType.substring(0, i); 661 } else { 662 mime = httpContentType; 663 } 664 mime = mime.trim(); 665 } 666 return mime; 667 } 668 669 private static final Pattern CHARSET_PATTERN = Pattern 670 .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 671 672 /** 673 * Returns charset parameter value, NULL if not present, NULL if 674 * httpContentType is NULL. 675 * 676 * @param httpContentType the HTTP content type 677 * @return The content type encoding (upcased) 678 */ 679 static String getContentTypeEncoding(final String httpContentType) { 680 String encoding = null; 681 if (httpContentType != null) { 682 final int i = httpContentType.indexOf(";"); 683 if (i > -1) { 684 final String postMime = httpContentType.substring(i + 1); 685 final Matcher m = CHARSET_PATTERN.matcher(postMime); 686 encoding = m.find() ? m.group(1) : null; 687 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null; 688 } 689 } 690 return encoding; 691 } 692 693 public static final Pattern ENCODING_PATTERN = Pattern.compile( 694 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", 695 Pattern.MULTILINE); 696 697 /** 698 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. 699 * 700 * @param is InputStream to create the reader from. 701 * @param guessedEnc guessed encoding 702 * @return the encoding declared in the <?xml encoding=...?> 703 * @throws IOException thrown if there is a problem reading the stream. 704 */ 705 private static String getXmlProlog(final InputStream is, final String guessedEnc) 706 throws IOException { 707 String encoding = null; 708 if (guessedEnc != null) { 709 final byte[] bytes = new byte[BUFFER_SIZE]; 710 is.mark(BUFFER_SIZE); 711 int offset = 0; 712 int max = BUFFER_SIZE; 713 int c = is.read(bytes, offset, max); 714 int firstGT = -1; 715 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 716 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { 717 offset += c; 718 max -= c; 719 c = is.read(bytes, offset, max); 720 xmlProlog = new String(bytes, 0, offset, guessedEnc); 721 firstGT = xmlProlog.indexOf('>'); 722 } 723 if (firstGT == -1) { 724 if (c == -1) { 725 throw new IOException("Unexpected end of XML stream"); 726 } else { 727 throw new IOException( 728 "XML prolog or ROOT element not found on first " 729 + offset + " bytes"); 730 } 731 } 732 final int bytesRead = offset; 733 if (bytesRead > 0) { 734 is.reset(); 735 final BufferedReader bReader = new BufferedReader(new StringReader( 736 xmlProlog.substring(0, firstGT + 1))); 737 final StringBuffer prolog = new StringBuffer(); 738 String line = bReader.readLine(); 739 while (line != null) { 740 prolog.append(line); 741 line = bReader.readLine(); 742 } 743 final Matcher m = ENCODING_PATTERN.matcher(prolog); 744 if (m.find()) { 745 encoding = m.group(1).toUpperCase(); 746 encoding = encoding.substring(1, encoding.length() - 1); 747 } 748 } 749 } 750 return encoding; 751 } 752 753 /** 754 * Indicates if the MIME type belongs to the APPLICATION XML family. 755 * 756 * @param mime The mime type 757 * @return true if the mime type belongs to the APPLICATION XML family, 758 * otherwise false 759 */ 760 static boolean isAppXml(final String mime) { 761 return mime != null && 762 (mime.equals("application/xml") || 763 mime.equals("application/xml-dtd") || 764 mime.equals("application/xml-external-parsed-entity") || 765 mime.startsWith("application/") && mime.endsWith("+xml")); 766 } 767 768 /** 769 * Indicates if the MIME type belongs to the TEXT XML family. 770 * 771 * @param mime The mime type 772 * @return true if the mime type belongs to the TEXT XML family, 773 * otherwise false 774 */ 775 static boolean isTextXml(final String mime) { 776 return mime != null && 777 (mime.equals("text/xml") || 778 mime.equals("text/xml-external-parsed-entity") || 779 mime.startsWith("text/") && mime.endsWith("+xml")); 780 } 781 782 private static final String RAW_EX_1 = 783 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 784 785 private static final String RAW_EX_2 = 786 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 787 788 private static final String HTTP_EX_1 = 789 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; 790 791 private static final String HTTP_EX_2 = 792 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 793 794 private static final String HTTP_EX_3 = 795 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; 796 797}