/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.ByteOrderMark;

/**
 * Character stream that handles all the necessary Voodoo to figure out the
 * charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 * This one IS a character stream.
 * <p>
 * All this has to be done without consuming characters from the stream, if not
 * the XML parser will not recognize the document as a valid XML. This is not
 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
 * right now, XmlStreamReader handles it and things work in all parsers).
 * <p>
 * The XmlStreamReader class handles the charset encoding of XML documents in
 * Files, raw streams and HTTP streams by offering a wide set of constructors.
 * <p>
 * By default the charset encoding detection is lenient, the constructors with
 * the lenient flag can be used for strict detection (following the HTTP MIME
 * and XML specifications). All this is nicely explained by Mark Pilgrim in his
 * blog, <a
 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * <p>
 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 * Apache License 2.0.
 *
 * @version $Id: XmlStreamReader.java 1346400 2012-06-05 14:48:01Z ggregory $
 * @see org.apache.commons.io.output.XmlStreamWriter
 * @since 2.0
 */
public class XmlStreamReader extends Reader {
    private static final int BUFFER_SIZE = 4096;

    private static final String UTF_8 = "UTF-8";

    private static final String US_ASCII = "US-ASCII";

    private static final String UTF_16BE = "UTF-16BE";

    private static final String UTF_16LE = "UTF-16LE";

    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    private static final String UTF_16 = "UTF-16";

    private static final String UTF_32 = "UTF-32";

    private static final String EBCDIC = "CP1047";

    /** Byte order marks looked for at the very start of the stream. */
    private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
        ByteOrderMark.UTF_8,
        ByteOrderMark.UTF_16BE,
        ByteOrderMark.UTF_16LE,
        ByteOrderMark.UTF_32BE,
        ByteOrderMark.UTF_32LE
    };

    // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
    /**
     * Byte sequences of the characters {@code <?xm} in each candidate
     * encoding; used to guess the encoding family from the first bytes of the
     * XML prolog.
     */
    private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] {
        new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
        new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
        new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
        new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
                0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
        new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
                0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
        new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
    };

    private final Reader reader;

    private final String encoding;

    private final String defaultEncoding;

    /**
     * Returns the default encoding to use if none is set in HTTP content-type,
     * XML prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is NULL the content-type based rules are used.
     *
     * @return the default encoding to use.
     */
    public String getDefaultEncoding() {
        return defaultEncoding;
    }

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
     * if this is also missing defaults to UTF-8.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * <p>
     * The file stream opened by this constructor is closed again if the
     * encoding detection fails, so a failed construction does not leak a file
     * handle.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(File file) throws IOException {
        this.defaultEncoding = null;
        final InputStream is = new FileInputStream(file);
        boolean constructed = false;
        try {
            final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
            final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
            this.encoding = doRawStream(bom, pis, true);
            this.reader = new InputStreamReader(pis, encoding);
            constructed = true;
        } finally {
            // Nobody else holds a reference to this stream, so release it on failure.
            if (!constructed) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(InputStream is) throws IOException {
        this(is, true);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, boolean lenient) throws IOException {
        this(is, lenient, null);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else the default encoding is used, or 'UTF-8' if no default encoding was
     * given.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     * <p>
     * The supplied stream stays owned by the caller if this constructor
     * throws; it is not closed here.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException {
        this.defaultEncoding = defaultEncoding;
        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
        this.encoding = doRawStream(bom, pis, lenient);
        this.reader = new InputStreamReader(pis, encoding);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in
     * the fetched data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the
     * fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URL.
     */
    public XmlStreamReader(URL url) throws IOException {
        this(url.openConnection(), null);
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is not
     * 'content-type' header in the fetched data it uses the same logic used for
     * files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
     * the fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * <p>
     * The stream obtained from the connection is closed again if the encoding
     * detection fails.
     *
     * @param conn URLConnection to create a Reader from.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URLConnection.
     */
    public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException {
        this.defaultEncoding = defaultEncoding;
        final boolean lenient = true;
        final String contentType = conn.getContentType();
        final InputStream is = conn.getInputStream();
        boolean constructed = false;
        try {
            final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
            final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
            if (conn instanceof HttpURLConnection || contentType != null) {
                this.encoding = doHttpStream(bom, pis, contentType, lenient);
            } else {
                this.encoding = doRawStream(bom, pis, lenient);
            }
            this.reader = new InputStreamReader(pis, encoding);
            constructed = true;
        } finally {
            // The stream was opened here and has already been partially consumed,
            // so it is of no further use to anybody once construction has failed.
            if (!constructed) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(InputStream is, String httpContentType)
            throws IOException {
        this(is, httpContentType, true);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else the default encoding is used, or 'UTF-8' if no default encoding was
     * given.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     * <p>
     * The supplied stream stays owned by the caller if this constructor
     * throws; it is not closed here.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, String httpContentType,
            boolean lenient, String defaultEncoding) throws IOException {
        this.defaultEncoding = defaultEncoding;
        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS);
        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
        this.encoding = doHttpStream(bom, pis, httpContentType, lenient);
        this.reader = new InputStreamReader(pis, encoding);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(InputStream is, String httpContentType,
            boolean lenient) throws IOException {
        this(is, httpContentType, lenient, null);
    }

    /**
     * Returns the charset encoding of the XmlStreamReader.
     *
     * @return charset encoding.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Invokes the underlying reader's <code>read(char[], int, int)</code> method.
     * @param buf the buffer to read the characters into
     * @param offset The start offset
     * @param len The maximum number of characters to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] buf, int offset, int len) throws IOException {
        return reader.read(buf, offset, len);
    }

    /**
     * Closes the XmlStreamReader stream.
     *
     * @throws IOException thrown if there was a problem closing the stream.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Closes a stream while construction is being aborted, discarding any
     * failure of the close itself so that the original exception is the one
     * the caller sees.
     *
     * @param is the stream to close
     */
    private static void closeQuietly(InputStream is) {
        try {
            is.close();
        } catch (IOException ignored) {
            // Best effort only: the failure that triggered this cleanup is the one to report.
        }
    }

    /**
     * Process the raw stream.
     *
     * @param bom BOMInputStream to detect byte order marks
     * @param pis BOMInputStream to guess XML encoding
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the encoding to be used
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String doRawStream(BOMInputStream bom, BOMInputStream pis, boolean lenient)
            throws IOException {
        final String bomEnc      = bom.getBOMCharsetName();
        final String xmlGuessEnc = pis.getBOMCharsetName();
        final String xmlEnc      = getXmlProlog(pis, xmlGuessEnc);
        try {
            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
        } catch (XmlStreamReaderException ex) {
            if (lenient) {
                return doLenientDetection(null, ex);
            }
            throw ex;
        }
    }

    /**
     * Process a HTTP stream.
     *
     * @param bom BOMInputStream to detect byte order marks
     * @param pis BOMInputStream to guess XML encoding
     * @param httpContentType The HTTP content type
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the encoding to be used
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String doHttpStream(BOMInputStream bom, BOMInputStream pis, String httpContentType,
            boolean lenient) throws IOException {
        final String bomEnc      = bom.getBOMCharsetName();
        final String xmlGuessEnc = pis.getBOMCharsetName();
        final String xmlEnc      = getXmlProlog(pis, xmlGuessEnc);
        try {
            return calculateHttpEncoding(httpContentType, bomEnc,
                    xmlGuessEnc, xmlEnc, lenient);
        } catch (XmlStreamReaderException ex) {
            if (lenient) {
                return doLenientDetection(httpContentType, ex);
            }
            throw ex;
        }
    }

    /**
     * Do lenient detection.
     * <p>
     * A 'text/html' content type is retried as 'text/xml'. If that also fails
     * (or the content type was something else) the XML prolog encoding, then
     * the content-type encoding, then the default encoding (or UTF-8 when no
     * default was given) is used.
     *
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param ex The thrown exception
     * @return the encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String doLenientDetection(String httpContentType,
            XmlStreamReaderException ex) throws IOException {
        if (httpContentType != null && httpContentType.startsWith("text/html")) {
            // Keep any parameters (e.g. charset) that followed the MIME type.
            httpContentType = httpContentType.substring("text/html".length());
            httpContentType = "text/xml" + httpContentType;
            try {
                return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
                        ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
            } catch (XmlStreamReaderException ex2) {
                // Fall through using the details of the most recent failure.
                ex = ex2;
            }
        }
        String encoding = ex.getXmlEncoding();
        if (encoding == null) {
            encoding = ex.getContentTypeEncoding();
        }
        if (encoding == null) {
            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
        }
        return encoding;
    }

    /**
     * Calculate the raw encoding.
     *
     * @param bomEnc BOM encoding
     * @param xmlGuessEnc XML Guess encoding
     * @param xmlEnc XML encoding
     * @return the raw encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
            String xmlEnc) throws IOException {

        // BOM is Null
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return defaultEncoding == null ? UTF_8 : defaultEncoding;
            }
            // A generic "UTF-16"/"UTF-32" declaration carries no byte order and
            // there is no BOM to supply it, so the byte order guessed from the
            // prolog bytes has to be used instead.
            if (xmlEnc.equals(UTF_16) &&
               (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                return xmlGuessEnc;
            }
            if (xmlEnc.equals(UTF_32) &&
               (xmlGuessEnc.equals(UTF_32BE) || xmlGuessEnc.equals(UTF_32LE))) {
                return xmlGuessEnc;
            }
            return xmlEnc;
        }

        // BOM is UTF-8
        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is UTF-16BE or UTF-16LE
        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is UTF-32BE or UTF-32LE
        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
                String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is something else
        String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
    }


    /**
     * Calculate the HTTP encoding.
     *
     * @param httpContentType The HTTP content type
     * @param bomEnc BOM encoding
     * @param xmlGuessEnc XML Guess encoding
     * @param xmlEnc XML encoding
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the HTTP encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    String calculateHttpEncoding(String httpContentType,
            String bomEnc, String xmlGuessEnc, String xmlEnc,
            boolean lenient) throws IOException {

        // Lenient and has XML encoding
        if (lenient && xmlEnc != null) {
            return xmlEnc;
        }

        // Determine mime/encoding content types from HTTP Content Type
        final String cTMime = getContentTypeMime(httpContentType);
        final String cTEnc  = getContentTypeEncoding(httpContentType);
        final boolean appXml  = isAppXml(cTMime);
        final boolean textXml = isTextXml(cTMime);

        // Mime type NOT "application/xml" or "text/xml"
        if (!appXml && !textXml) {
            String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        // No content type encoding
        if (cTEnc == null) {
            if (appXml) {
                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
            }
            return defaultEncoding == null ? US_ASCII : defaultEncoding;
        }

        // UTF-16BE or UTF-16LE content type encoding
        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
            if (bomEnc != null) {
                String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return cTEnc;
        }

        // UTF-16 content type encoding
        if (cTEnc.equals(UTF_16)) {
            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                return bomEnc;
            }
            String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        // UTF-32BE or UTF-32LE content type encoding
        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
            if (bomEnc != null) {
                String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return cTEnc;
        }

        // UTF-32 content type encoding
        if (cTEnc.equals(UTF_32)) {
            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
                return bomEnc;
            }
            String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        return cTEnc;
    }

    /**
     * Returns MIME type or NULL if httpContentType is NULL.
     *
     * @param httpContentType the HTTP content type
     * @return The mime content type
     */
    static String getContentTypeMime(String httpContentType) {
        String mime = null;
        if (httpContentType != null) {
            // Everything before the first ';' is the MIME type; parameters follow it.
            int i = httpContentType.indexOf(";");
            if (i >= 0) {
                mime = httpContentType.substring(0, i);
            } else {
                mime = httpContentType;
            }
            mime = mime.trim();
        }
        return mime;
    }

    private static final Pattern CHARSET_PATTERN = Pattern
            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");

    /**
     * Returns charset parameter value, NULL if not present, NULL if
     * httpContentType is NULL.
     *
     * @param httpContentType the HTTP content type
     * @return The content type encoding (upcased)
     */
    static String getContentTypeEncoding(String httpContentType) {
        String encoding = null;
        if (httpContentType != null) {
            int i = httpContentType.indexOf(";");
            if (i > -1) {
                String postMime = httpContentType.substring(i + 1);
                Matcher m = CHARSET_PATTERN.matcher(postMime);
                encoding = m.find() ? m.group(1) : null;
                encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null;
            }
        }
        return encoding;
    }

    /**
     * Pattern capturing the (still quoted) value of the encoding attribute of
     * an XML declaration.
     */
    public static final Pattern ENCODING_PATTERN = Pattern.compile(
            "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
            Pattern.MULTILINE);

    /**
     * Returns the encoding declared in the {@code <?xml encoding=...?>}, NULL
     * if none.
     * <p>
     * Nothing is read when no encoding could be guessed. Otherwise up to
     * {@code BUFFER_SIZE} bytes are read and decoded with the guessed encoding
     * to locate the end of the first tag, and the stream is reset afterwards
     * so that the bytes can be read again.
     *
     * @param is InputStream to create the reader from.
     * @param guessedEnc guessed encoding
     * @return the encoding declared in the {@code <?xml encoding=...?>}
     *         (upcased, without the surrounding quotes)
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private static String getXmlProlog(InputStream is, String guessedEnc)
            throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            byte[] bytes = new byte[BUFFER_SIZE];
            is.mark(BUFFER_SIZE);
            int offset = 0;
            int max = BUFFER_SIZE;
            int c = is.read(bytes, offset, max);
            int firstGT = -1;
            String xmlProlog = null;
            // Accumulate bytes until the first '>' shows up, the stream ends
            // or the buffer is full.
            while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
                offset += c;
                max -= c;
                c = is.read(bytes, offset, max);
                xmlProlog = new String(bytes, 0, offset, guessedEnc);
                firstGT = xmlProlog.indexOf('>');
            }
            if (firstGT == -1) {
                if (c == -1) {
                    throw new IOException("Unexpected end of XML stream");
                } else {
                    throw new IOException(
                            "XML prolog or ROOT element not found on first "
                                    + offset + " bytes");
                }
            }
            int bytesRead = offset;
            if (bytesRead > 0) {
                is.reset();
                // Join the lines of the first tag so that a declaration spread
                // over several lines can be matched by the pattern.
                BufferedReader bReader = new BufferedReader(new StringReader(
                        xmlProlog.substring(0, firstGT + 1)));
                StringBuilder prolog = new StringBuilder();
                String line = bReader.readLine();
                while (line != null) {
                    prolog.append(line);
                    line = bReader.readLine();
                }
                Matcher m = ENCODING_PATTERN.matcher(prolog);
                if (m.find()) {
                    // Upper-case with a fixed locale: charset names are ASCII and
                    // must not be subject to locale-specific case rules (e.g. the
                    // Turkish dotted capital I).
                    encoding = m.group(1).toUpperCase(Locale.US);
                    // Strip the surrounding quotes captured by the pattern.
                    encoding = encoding.substring(1, encoding.length() - 1);
                }
            }
        }
        return encoding;
    }

    /**
     * Indicates if the MIME type belongs to the APPLICATION XML family.
     *
     * @param mime The mime type
     * @return true if the mime type belongs to the APPLICATION XML family,
     *         otherwise false
     */
    static boolean isAppXml(String mime) {
        return mime != null &&
               (mime.equals("application/xml") ||
                mime.equals("application/xml-dtd") ||
                mime.equals("application/xml-external-parsed-entity") ||
                mime.startsWith("application/") && mime.endsWith("+xml"));
    }

    /**
     * Indicates if the MIME type belongs to the TEXT XML family.
     *
     * @param mime The mime type
     * @return true if the mime type belongs to the TEXT XML family,
     *         otherwise false
     */
    static boolean isTextXml(String mime) {
        return mime != null &&
               (mime.equals("text/xml") ||
                mime.equals("text/xml-external-parsed-entity") ||
                mime.startsWith("text/") && mime.endsWith("+xml"));
    }

    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    private static final String HTTP_EX_1 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    private static final String HTTP_EX_2 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";

}